PreProcessing Doublets inference

Doublet inference was done using two packages : Scrublet and DoubletDetection. These processing steps load the each of the 35 datasets, performs low quality cells filtering then infer the presence of doublets. The cells inferred status (doublet or singlet) is stored in the metadata of each dataset. The datasets are then normalized to 10000 UMIs and aggregated to enabled a first look analysis of the doublet inference results.

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import os
import scrublet as scr
import doubletdetection

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
scanpy==1.4+39.gc70f24b anndata==0.6.18 numpy==1.16.2 scipy==1.2.1 pandas==0.24.1 scikit-learn==0.20.3 statsmodels==0.9.0 python-igraph==0.7.1 
In [2]:
def remove_RB_genes(
    df,
    path_to_RB_genes_file = '/home/deprez/HCA/PeerLab_analysis/RB_genes'
):
    """Removes all columns of RB genes as listed in the RB gene file.
    RB_genes_file should contain gene names, one gene name per line.
    Returns RB gene-depleted df, pd.Series with number of counts removed 
    per cell, and a list of RB genes that were in the df."""
    with open(path_to_RB_genes_file,'r') as file:
        lines = file.readlines()
    genes = [x.rstrip('\n') for x in lines]
    RB_genes_in_df = []
    df_genes = df.columns
    for gene in genes:
        if gene in df_genes:
            RB_genes_in_df.append(gene)
    # df_genes = df.columns
    # ribosomal_genes = []
    # for GENE in df_genes:
    #     if GENE[:3] in ['RPL','RPS']:
    #         ribosomal_genes.append(GENE)
    # store the number of RB molecules per cell in a Series object with cell labels as indices
    counts_removed_per_cell = pd.Series(index = df.index)
    for cell in df.index:
        counts_removed_per_cell[cell] = sum(df.loc[cell][RB_genes_in_df])
    # now drop all columns with RB genes:
    df_RB_depleted = df.drop(columns = RB_genes_in_df)
    return df_RB_depleted, counts_removed_per_cell, RB_genes_in_df
In [3]:
sc.settings.set_figure_params(dpi=80)
In [4]:
os.chdir('/home/deprez/HCA/Data/')
outsPath = 'outs/filtered_gene_bc_matrices/ucagenomix-cellranger-hg19-1.3.0/'

Nasal Biopsies

Back to top

In [5]:
D322_Biop_Nas1 = sc.read_10x_mtx(
    './D322_Biop_Nas1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D322_Biop_Nas1.var_names_make_unique()
D322_Biop_Nas1.obs['manip'] = 'D322_Biop_Nas1'
D322_Biop_Nas1.obs['position'] = 'Nasal'
D322_Biop_Nas1.obs['method'] = 'Biopsy'
D322_Biop_Nas1.obs['donor'] = 'D322'
D322_Biop_Nas1.obs['name'] = ['D322_Biop_Nas1_' + s for s in list(D322_Biop_Nas1.obs.index)]
D322_Biop_Nas1.obs_names = D322_Biop_Nas1.obs['name']
D322_Biop_Nas1
... reading from cache file ./cache/D322_Biop_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[5]:
AnnData object with n_obs × n_vars = 1797 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [6]:
sc.pl.highest_expr_genes(D322_Biop_Nas1, n_top=20)
In [7]:
sc.pp.filter_cells(D322_Biop_Nas1, min_genes=0)
mito_genes = D322_Biop_Nas1.var_names.str.startswith('MT-')
D322_Biop_Nas1.obs['percent_mito'] = np.sum(
    D322_Biop_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D322_Biop_Nas1.X, axis=1).A1
D322_Biop_Nas1.obs['n_counts'] = D322_Biop_Nas1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D322_Biop_Nas1.to_df())
ribo_genes = D322_Biop_Nas1.to_df().columns.isin(RB_genes_in_df)
D322_Biop_Nas1.obs['percent_ribo'] = np.sum(
    D322_Biop_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D322_Biop_Nas1.X, axis=1).A1
D322_Biop_Nas1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D322_Biop_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [8]:
sc.pp.filter_cells(D322_Biop_Nas1, min_genes=500)
D322_Biop_Nas1 = D322_Biop_Nas1[D322_Biop_Nas1.obs['n_counts'] < 40000, :]
D322_Biop_Nas1 = D322_Biop_Nas1[D322_Biop_Nas1.obs['percent_mito'] < 0.2, :]
filtered out 10 cells that have less than 500 genes expressed
In [9]:
D322_Biop_Nas1.shape
Out[9]:
(1780, 32739)
In [10]:
# scrublet
scrub = scr.Scrublet(D322_Biop_Nas1.X, expected_doublet_rate=0.016) 
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D322_Biop_Nas1.obs['doublet_scores'] = doublet_scores
D322_Biop_Nas1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram();
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.17
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 9.6%
Overall doublet rate:
	Expected   = 1.6%
	Estimated  = 4.7%
Elapsed time: 1.4 seconds
In [11]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D322_Biop_Nas1.X).predict()
D322_Biop_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4085679054260254 seconds
Jaccard graph constructed in 0.3359701633453369 seconds
Wrote graph to binary file in 0.1134796142578125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.851025
After 2 runs, maximum modularity is Q = 0.852223
Louvain completed 22 runs in 0.776637077331543 seconds
PhenoGraph complete in 1.6490111351013184 seconds
Found communities [-1, ... 12], with sizes: [132, 489, 246, 245, 199, 178, 157, 146, 131, 84, 74, 66, 59, 19]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40769004821777344 seconds
Jaccard graph constructed in 0.29503822326660156 seconds
Wrote graph to binary file in 0.03574967384338379 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.85688
Louvain completed 21 runs in 0.6499230861663818 seconds
PhenoGraph complete in 1.4009625911712646 seconds
Found communities [-1, ... 11], with sizes: [148, 434, 306, 261, 245, 187, 150, 129, 113, 89, 78, 55, 30]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30634331703186035 seconds
Jaccard graph constructed in 0.3103601932525635 seconds
Wrote graph to binary file in 0.034491539001464844 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.851977
After 3 runs, maximum modularity is Q = 0.854194
Louvain completed 23 runs in 0.8237855434417725 seconds
PhenoGraph complete in 1.4866650104522705 seconds
Found communities [-1, ... 13], with sizes: [141, 324, 273, 238, 231, 172, 159, 143, 116, 103, 82, 77, 76, 49, 41]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40821146965026855 seconds
Jaccard graph constructed in 0.30414676666259766 seconds
Wrote graph to binary file in 0.11415791511535645 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.855464
Louvain completed 21 runs in 0.6677343845367432 seconds
PhenoGraph complete in 1.504737138748169 seconds
Found communities [-1, ... 14], with sizes: [147, 339, 219, 203, 203, 203, 192, 161, 150, 117, 82, 73, 54, 35, 28, 19]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30776405334472656 seconds
Jaccard graph constructed in 0.3069312572479248 seconds
Wrote graph to binary file in 0.03258657455444336 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.852832
Louvain completed 21 runs in 0.6703841686248779 seconds
PhenoGraph complete in 1.327599287033081 seconds
Found communities [-1, ... 15], with sizes: [150, 420, 231, 225, 222, 175, 171, 114, 109, 91, 90, 77, 45, 39, 39, 16, 11]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30767202377319336 seconds
Jaccard graph constructed in 0.2992825508117676 seconds
Wrote graph to binary file in 0.12420010566711426 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.856174
Louvain completed 21 runs in 0.6723473072052002 seconds
PhenoGraph complete in 1.4187166690826416 seconds
Found communities [-1, ... 13], with sizes: [136, 400, 269, 264, 224, 192, 163, 154, 137, 97, 79, 40, 36, 21, 13]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3098146915435791 seconds
Jaccard graph constructed in 0.3055295944213867 seconds
Wrote graph to binary file in 0.035646915435791016 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.854862
Louvain completed 21 runs in 0.6756174564361572 seconds
PhenoGraph complete in 1.337935209274292 seconds
Found communities [-1, ... 13], with sizes: [144, 291, 230, 223, 220, 210, 173, 172, 160, 102, 82, 75, 69, 38, 36]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2076280117034912 seconds
Jaccard graph constructed in 0.29665303230285645 seconds
Wrote graph to binary file in 0.03459882736206055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.855011
After 2 runs, maximum modularity is Q = 0.856402
After 5 runs, maximum modularity is Q = 0.857599
After 20 runs, maximum modularity is Q = 0.859197
Louvain completed 40 runs in 1.4657461643218994 seconds
PhenoGraph complete in 2.0164525508880615 seconds
Found communities [-1, ... 14], with sizes: [181, 330, 210, 207, 178, 165, 162, 161, 154, 132, 96, 78, 60, 52, 47, 12]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20734572410583496 seconds
Jaccard graph constructed in 0.290557861328125 seconds
Wrote graph to binary file in 0.13068079948425293 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.857174
Louvain completed 21 runs in 0.6959750652313232 seconds
PhenoGraph complete in 1.3394224643707275 seconds
Found communities [-1, ... 13], with sizes: [113, 438, 256, 199, 192, 175, 141, 139, 133, 133, 91, 82, 80, 42, 11]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31047511100769043 seconds
Jaccard graph constructed in 0.3088827133178711 seconds
Wrote graph to binary file in 0.03481912612915039 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.852402
Louvain completed 21 runs in 0.6792612075805664 seconds
PhenoGraph complete in 1.34515380859375 seconds
Found communities [-1, ... 12], with sizes: [143, 408, 229, 224, 223, 212, 194, 155, 129, 96, 77, 56, 43, 36]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3071448802947998 seconds
Jaccard graph constructed in 0.3109931945800781 seconds
Wrote graph to binary file in 0.13086557388305664 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.85427
Louvain completed 21 runs in 0.6788380146026611 seconds
PhenoGraph complete in 1.4446029663085938 seconds
Found communities [-1, ... 14], with sizes: [133, 362, 242, 228, 204, 202, 193, 147, 131, 96, 81, 74, 50, 49, 21, 12]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20951056480407715 seconds
Jaccard graph constructed in 0.2941608428955078 seconds
Wrote graph to binary file in 0.03218817710876465 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.858102
After 4 runs, maximum modularity is Q = 0.859675
Louvain completed 24 runs in 0.8347189426422119 seconds
PhenoGraph complete in 1.3833091259002686 seconds
Found communities [-1, ... 15], with sizes: [138, 482, 236, 220, 182, 171, 134, 131, 130, 94, 84, 80, 57, 33, 24, 17, 12]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20773720741271973 seconds
Jaccard graph constructed in 0.27033162117004395 seconds
Wrote graph to binary file in 0.03422141075134277 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.854994
After 3 runs, maximum modularity is Q = 0.856285
Louvain completed 23 runs in 0.8078761100769043 seconds
PhenoGraph complete in 1.3342747688293457 seconds
Found communities [-1, ... 12], with sizes: [142, 346, 250, 238, 195, 192, 181, 172, 132, 110, 98, 80, 45, 44]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3062746524810791 seconds
Jaccard graph constructed in 0.3097362518310547 seconds
Wrote graph to binary file in 0.12377023696899414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.854888
After 2 runs, maximum modularity is Q = 0.856021
Louvain completed 22 runs in 0.803412675857544 seconds
PhenoGraph complete in 1.5556888580322266 seconds
Found communities [-1, ... 12], with sizes: [141, 446, 245, 230, 210, 189, 134, 129, 125, 104, 86, 77, 74, 35]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3096494674682617 seconds
Jaccard graph constructed in 0.2952749729156494 seconds
Wrote graph to binary file in 0.0346531867980957 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.851256
After 3 runs, maximum modularity is Q = 0.853721
Louvain completed 23 runs in 0.8304622173309326 seconds
PhenoGraph complete in 1.4832394123077393 seconds
Found communities [-1, ... 15], with sizes: [124, 421, 242, 234, 180, 164, 148, 143, 130, 87, 83, 76, 67, 67, 26, 19, 14]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3072354793548584 seconds
Jaccard graph constructed in 0.2686634063720703 seconds
Wrote graph to binary file in 0.03485536575317383 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.852857
After 2 runs, maximum modularity is Q = 0.854892
Louvain completed 22 runs in 0.7863106727600098 seconds
PhenoGraph complete in 1.421738624572754 seconds
Found communities [-1, ... 15], with sizes: [143, 455, 230, 228, 189, 162, 124, 113, 104, 99, 82, 81, 76, 52, 42, 25, 20]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30680036544799805 seconds
Jaccard graph constructed in 0.41535520553588867 seconds
Wrote graph to binary file in 0.034188270568847656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.849737
After 14 runs, maximum modularity is Q = 0.850778
Louvain completed 34 runs in 1.1089999675750732 seconds
PhenoGraph complete in 1.8784644603729248 seconds
Found communities [-1, ... 15], with sizes: [143, 434, 237, 233, 226, 163, 155, 153, 139, 102, 72, 51, 39, 37, 17, 12, 12]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3088347911834717 seconds
Jaccard graph constructed in 0.2695302963256836 seconds
Wrote graph to binary file in 0.03755521774291992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.857925
Louvain completed 21 runs in 0.7062399387359619 seconds
PhenoGraph complete in 1.3346896171569824 seconds
Found communities [-1, ... 14], with sizes: [133, 278, 223, 214, 187, 186, 172, 169, 132, 115, 85, 84, 74, 71, 68, 34]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3076784610748291 seconds
Jaccard graph constructed in 0.30925512313842773 seconds
Wrote graph to binary file in 0.1296396255493164 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.851976
Louvain completed 21 runs in 0.6892428398132324 seconds
PhenoGraph complete in 1.448503017425537 seconds
Found communities [-1, ... 14], with sizes: [106, 361, 236, 223, 187, 151, 148, 122, 115, 110, 103, 102, 79, 77, 66, 39]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30797600746154785 seconds
Jaccard graph constructed in 0.27184224128723145 seconds
Wrote graph to binary file in 0.03627347946166992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.852469
Louvain completed 21 runs in 0.6978359222412109 seconds
PhenoGraph complete in 1.3291597366333008 seconds
Found communities [-1, ... 16], with sizes: [151, 373, 209, 191, 189, 178, 166, 141, 125, 97, 96, 79, 61, 56, 51, 36, 13, 13]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.307117223739624 seconds
Jaccard graph constructed in 0.2681887149810791 seconds
Wrote graph to binary file in 0.03509783744812012 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.856575
After 4 runs, maximum modularity is Q = 0.857611
Louvain completed 24 runs in 0.8502638339996338 seconds
PhenoGraph complete in 1.4721050262451172 seconds
Found communities [-1, ... 16], with sizes: [153, 465, 260, 226, 177, 157, 152, 152, 132, 83, 79, 58, 38, 35, 22, 13, 12, 11]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30603790283203125 seconds
Jaccard graph constructed in 0.31660890579223633 seconds
Wrote graph to binary file in 0.12119674682617188 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.854788
After 13 runs, maximum modularity is Q = 0.85586
Louvain completed 33 runs in 1.109736442565918 seconds
PhenoGraph complete in 1.8680367469787598 seconds
Found communities [-1, ... 14], with sizes: [138, 432, 211, 211, 192, 146, 145, 142, 122, 117, 105, 76, 59, 57, 54, 18]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3064150810241699 seconds
Jaccard graph constructed in 0.296893835067749 seconds
Wrote graph to binary file in 0.0340418815612793 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.850866
After 2 runs, maximum modularity is Q = 0.854173
Louvain completed 22 runs in 0.7796761989593506 seconds
PhenoGraph complete in 1.4281444549560547 seconds
Found communities [-1, ... 13], with sizes: [137, 449, 233, 222, 216, 198, 149, 129, 99, 98, 96, 84, 53, 51, 11]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3071146011352539 seconds
Jaccard graph constructed in 0.3070831298828125 seconds
Wrote graph to binary file in 0.12588262557983398 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.857457
Louvain completed 21 runs in 0.6961038112640381 seconds
PhenoGraph complete in 1.448686122894287 seconds
Found communities [-1, ... 13], with sizes: [149, 448, 239, 223, 201, 191, 133, 130, 119, 115, 82, 78, 66, 34, 17]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3065071105957031 seconds
Jaccard graph constructed in 0.27219343185424805 seconds
Wrote graph to binary file in 0.03264212608337402 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.853015
After 2 runs, maximum modularity is Q = 0.854128
After 3 runs, maximum modularity is Q = 0.855507
Louvain completed 23 runs in 0.927466869354248 seconds
PhenoGraph complete in 1.5605666637420654 seconds
Found communities [-1, ... 12], with sizes: [119, 342, 313, 245, 236, 234, 166, 145, 103, 99, 76, 60, 48, 39]

In [12]:
sc.pp.normalize_per_cell(D322_Biop_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D322_Biop_Nas1) # log transform the data
D322_Biop_Nas1.raw = D322_Biop_Nas1 # freeze the object (for later use of the raw state of it)
In [13]:
D322_Biop_Nas1 = D322_Biop_Nas1[:, D322_Biop_Nas1.var['ribo_genes']]
D322_Biop_Nas1
Out[13]:
View of AnnData object with n_obs × n_vars = 1780 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [14]:
D339_Biop_Nas1 = sc.read_10x_mtx(
    './D339_Biop_Nas1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D339_Biop_Nas1.var_names_make_unique()
D339_Biop_Nas1.obs['manip'] = 'D339_Biop_Nas1'
D339_Biop_Nas1.obs['position'] = 'Nasal'
D339_Biop_Nas1.obs['method'] = 'Biopsy'
D339_Biop_Nas1.obs['donor'] = 'D339'
D339_Biop_Nas1.obs['name'] = ['D339_Biop_Nas1_' + s for s in list(D339_Biop_Nas1.obs.index)]
D339_Biop_Nas1.obs_names = D339_Biop_Nas1.obs['name']
D339_Biop_Nas1
... reading from cache file ./cache/D339_Biop_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[14]:
AnnData object with n_obs × n_vars = 1917 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [15]:
sc.pl.highest_expr_genes(D339_Biop_Nas1, n_top=20)
In [16]:
sc.pp.filter_cells(D339_Biop_Nas1, min_genes=0)
mito_genes = D339_Biop_Nas1.var_names.str.startswith('MT-')
D339_Biop_Nas1.obs['percent_mito'] = np.sum(
    D339_Biop_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D339_Biop_Nas1.X, axis=1).A1
D339_Biop_Nas1.obs['n_counts'] = D339_Biop_Nas1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D339_Biop_Nas1.to_df())
ribo_genes = D339_Biop_Nas1.to_df().columns.isin(RB_genes_in_df)
D339_Biop_Nas1.obs['percent_ribo'] = np.sum(
    D339_Biop_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D339_Biop_Nas1.X, axis=1).A1
D339_Biop_Nas1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D339_Biop_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [17]:
sc.pp.filter_cells(D339_Biop_Nas1, min_genes=500)
D339_Biop_Nas1 = D339_Biop_Nas1[D339_Biop_Nas1.obs['n_counts'] < 40000, :]
D339_Biop_Nas1 = D339_Biop_Nas1[D339_Biop_Nas1.obs['percent_mito'] < 0.15, :]
filtered out 3 cells that have less than 500 genes expressed
In [18]:
# scrublet
scrub = scr.Scrublet(D339_Biop_Nas1.X, expected_doublet_rate=0.016) 
scrub = scr.Scrublet(D339_Biop_Nas1.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D339_Biop_Nas1.obs['doublet_scores'] = doublet_scores
D339_Biop_Nas1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.47
Detected doublet rate = 0.5%
Estimated detectable doublet fraction = 41.5%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 1.3%
Elapsed time: 1.4 seconds
Out[18]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecaf6ff28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecaf04438>],
       dtype=object))
In [19]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D339_Biop_Nas1.X).predict()
D339_Biop_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2066202163696289 seconds
Jaccard graph constructed in 0.3158748149871826 seconds
Wrote graph to binary file in 0.04000139236450195 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914813
After 3 runs, maximum modularity is Q = 0.916011
Louvain completed 23 runs in 0.8884556293487549 seconds
PhenoGraph complete in 1.4616329669952393 seconds
Found communities [-1, ... 21], with sizes: [94, 295, 221, 207, 133, 132, 132, 131, 121, 118, 104, 96, 93, 91, 66, 52, 50, 49, 41, 36, 33, 29, 24]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30696773529052734 seconds
Jaccard graph constructed in 0.3256070613861084 seconds
Wrote graph to binary file in 0.12011384963989258 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914471
Louvain completed 21 runs in 0.7269222736358643 seconds
PhenoGraph complete in 1.4904849529266357 seconds
Found communities [-1, ... 19], with sizes: [87, 383, 243, 217, 167, 166, 125, 119, 118, 96, 96, 85, 70, 64, 57, 51, 50, 50, 40, 35, 29]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31159067153930664 seconds
Jaccard graph constructed in 0.32219552993774414 seconds
Wrote graph to binary file in 0.04117703437805176 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.917203
Louvain completed 21 runs in 0.7120158672332764 seconds
PhenoGraph complete in 1.398862361907959 seconds
Found communities [-1, ... 20], with sizes: [92, 265, 231, 159, 156, 155, 145, 112, 112, 110, 109, 97, 94, 92, 84, 71, 56, 49, 48, 43, 35, 33]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3081836700439453 seconds
Jaccard graph constructed in 0.31020474433898926 seconds
Wrote graph to binary file in 0.12661218643188477 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914894
Louvain completed 21 runs in 0.7269814014434814 seconds
PhenoGraph complete in 1.4843578338623047 seconds
Found communities [-1, ... 19], with sizes: [84, 273, 250, 203, 155, 149, 149, 141, 114, 109, 109, 107, 98, 81, 61, 55, 51, 44, 41, 39, 35]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20717930793762207 seconds
Jaccard graph constructed in 0.31676197052001953 seconds
Wrote graph to binary file in 0.0416409969329834 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912832
After 2 runs, maximum modularity is Q = 0.914457
After 4 runs, maximum modularity is Q = 0.915946
Louvain completed 24 runs in 1.0415267944335938 seconds
PhenoGraph complete in 1.621366024017334 seconds
Found communities [-1, ... 20], with sizes: [104, 305, 186, 173, 151, 144, 141, 139, 126, 103, 100, 97, 87, 85, 72, 66, 53, 51, 50, 42, 38, 35]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3072516918182373 seconds
Jaccard graph constructed in 0.32131314277648926 seconds
Wrote graph to binary file in 0.1205141544342041 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.916629
Louvain completed 21 runs in 0.7400810718536377 seconds
PhenoGraph complete in 1.5056416988372803 seconds
Found communities [-1, ... 19], with sizes: [82, 338, 226, 178, 145, 142, 142, 131, 112, 112, 104, 92, 91, 86, 85, 61, 52, 51, 45, 39, 34]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20781898498535156 seconds
Jaccard graph constructed in 0.3088233470916748 seconds
Wrote graph to binary file in 0.04128599166870117 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.916194
After 13 runs, maximum modularity is Q = 0.917268
Louvain completed 33 runs in 1.1683826446533203 seconds
PhenoGraph complete in 1.7367794513702393 seconds
Found communities [-1, ... 21], with sizes: [98, 262, 159, 150, 137, 135, 134, 123, 119, 115, 111, 111, 103, 102, 100, 61, 59, 53, 53, 50, 42, 36, 35]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3067307472229004 seconds
Jaccard graph constructed in 0.33370113372802734 seconds
Wrote graph to binary file in 0.12057757377624512 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913375
After 9 runs, maximum modularity is Q = 0.914495
Louvain completed 29 runs in 1.048865795135498 seconds
PhenoGraph complete in 1.826646327972412 seconds
Found communities [-1, ... 20], with sizes: [96, 241, 223, 175, 147, 138, 124, 123, 118, 113, 112, 108, 105, 97, 81, 74, 57, 51, 49, 41, 40, 35]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2067573070526123 seconds
Jaccard graph constructed in 0.31972408294677734 seconds
Wrote graph to binary file in 0.04212474822998047 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.915562
Louvain completed 21 runs in 0.7448432445526123 seconds
PhenoGraph complete in 1.3286876678466797 seconds
Found communities [-1, ... 20], with sizes: [103, 293, 183, 169, 167, 151, 145, 143, 109, 108, 108, 99, 87, 81, 71, 54, 53, 51, 50, 47, 41, 35]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20834088325500488 seconds
Jaccard graph constructed in 0.322249174118042 seconds
Wrote graph to binary file in 0.12173628807067871 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.915861
After 4 runs, maximum modularity is Q = 0.917071
Louvain completed 24 runs in 0.8866989612579346 seconds
PhenoGraph complete in 1.5576601028442383 seconds
Found communities [-1, ... 21], with sizes: [88, 260, 231, 222, 147, 142, 129, 122, 119, 112, 109, 104, 87, 86, 70, 55, 51, 50, 41, 38, 36, 30, 19]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2088916301727295 seconds
Jaccard graph constructed in 0.33864760398864746 seconds
Wrote graph to binary file in 0.04112577438354492 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.915896
Louvain completed 21 runs in 0.7342045307159424 seconds
PhenoGraph complete in 1.3354604244232178 seconds
Found communities [-1, ... 21], with sizes: [88, 248, 210, 152, 133, 129, 128, 127, 123, 120, 114, 112, 110, 110, 86, 65, 51, 51, 50, 40, 38, 36, 27]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31146860122680664 seconds
Jaccard graph constructed in 0.31763625144958496 seconds
Wrote graph to binary file in 0.146104097366333 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.916158
Louvain completed 21 runs in 0.7209784984588623 seconds
PhenoGraph complete in 1.51529860496521 seconds
Found communities [-1, ... 19], with sizes: [112, 247, 223, 204, 168, 158, 144, 144, 123, 115, 109, 107, 89, 71, 70, 52, 52, 49, 41, 40, 30]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30747437477111816 seconds
Jaccard graph constructed in 0.3243536949157715 seconds
Wrote graph to binary file in 0.04296755790710449 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.915712
Louvain completed 21 runs in 0.7537517547607422 seconds
PhenoGraph complete in 1.4425067901611328 seconds
Found communities [-1, ... 19], with sizes: [96, 310, 212, 211, 168, 160, 141, 136, 108, 108, 93, 87, 86, 80, 77, 59, 52, 51, 43, 36, 34]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30921292304992676 seconds
Jaccard graph constructed in 0.33057618141174316 seconds
Wrote graph to binary file in 0.15000510215759277 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914577
After 10 runs, maximum modularity is Q = 0.915875
Louvain completed 30 runs in 1.1045057773590088 seconds
PhenoGraph complete in 1.91184401512146 seconds
Found communities [-1, ... 21], with sizes: [105, 278, 205, 192, 183, 156, 135, 115, 111, 107, 105, 99, 97, 88, 69, 53, 49, 38, 37, 36, 34, 28, 28]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20765280723571777 seconds
Jaccard graph constructed in 0.3321225643157959 seconds
Wrote graph to binary file in 0.04221796989440918 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914789
Louvain completed 21 runs in 0.7014024257659912 seconds
PhenoGraph complete in 1.2956562042236328 seconds
Found communities [-1, ... 19], with sizes: [84, 304, 218, 218, 135, 132, 132, 121, 115, 113, 113, 103, 96, 93, 89, 59, 54, 51, 43, 40, 35]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30840134620666504 seconds
Jaccard graph constructed in 0.31452107429504395 seconds
Wrote graph to binary file in 0.041329383850097656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914288
Louvain completed 21 runs in 0.6980297565460205 seconds
PhenoGraph complete in 1.3723759651184082 seconds
Found communities [-1, ... 20], with sizes: [80, 292, 199, 165, 164, 150, 139, 128, 127, 110, 104, 97, 94, 87, 85, 75, 54, 52, 47, 41, 34, 24]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31139469146728516 seconds
Jaccard graph constructed in 0.4412109851837158 seconds
Wrote graph to binary file in 0.042226552963256836 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.917063
Louvain completed 21 runs in 0.7077591419219971 seconds
PhenoGraph complete in 1.5150043964385986 seconds
Found communities [-1, ... 20], with sizes: [85, 284, 254, 190, 153, 141, 119, 116, 112, 111, 110, 98, 94, 90, 77, 54, 53, 51, 48, 47, 31, 30]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3068504333496094 seconds
Jaccard graph constructed in 0.3177778720855713 seconds
Wrote graph to binary file in 0.04327201843261719 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913495
After 2 runs, maximum modularity is Q = 0.914576
Louvain completed 22 runs in 0.8603401184082031 seconds
PhenoGraph complete in 1.540285587310791 seconds
Found communities [-1, ... 20], with sizes: [87, 215, 204, 204, 153, 141, 137, 134, 120, 114, 112, 109, 98, 95, 92, 58, 56, 51, 49, 43, 40, 36]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3102548122406006 seconds
Jaccard graph constructed in 0.3326091766357422 seconds
Wrote graph to binary file in 0.14270853996276855 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.9174
Louvain completed 21 runs in 0.7318401336669922 seconds
PhenoGraph complete in 1.528918981552124 seconds
Found communities [-1, ... 20], with sizes: [81, 351, 256, 164, 133, 133, 128, 122, 113, 111, 104, 97, 88, 82, 76, 56, 51, 50, 42, 40, 39, 31]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4072751998901367 seconds
Jaccard graph constructed in 0.31981754302978516 seconds
Wrote graph to binary file in 0.04160428047180176 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913721
After 9 runs, maximum modularity is Q = 0.915008
Louvain completed 29 runs in 1.0451924800872803 seconds
PhenoGraph complete in 1.8252418041229248 seconds
Found communities [-1, ... 21], with sizes: [82, 228, 191, 164, 163, 155, 143, 124, 115, 107, 105, 104, 95, 90, 87, 79, 60, 53, 49, 41, 40, 38, 35]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3087790012359619 seconds
Jaccard graph constructed in 0.34668397903442383 seconds
Wrote graph to binary file in 0.1182863712310791 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.91467
Louvain completed 21 runs in 0.7347433567047119 seconds
PhenoGraph complete in 1.5200309753417969 seconds
Found communities [-1, ... 20], with sizes: [94, 247, 203, 194, 166, 136, 136, 121, 120, 114, 112, 101, 94, 93, 90, 57, 56, 52, 49, 42, 39, 32]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31101465225219727 seconds
Jaccard graph constructed in 0.3258686065673828 seconds
Wrote graph to binary file in 0.04131960868835449 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913669
Louvain completed 21 runs in 0.7208325862884521 seconds
PhenoGraph complete in 1.4099571704864502 seconds
Found communities [-1, ... 21], with sizes: [64, 222, 222, 168, 156, 154, 133, 118, 116, 112, 110, 109, 107, 107, 87, 66, 52, 50, 50, 38, 38, 38, 31]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30809998512268066 seconds
Jaccard graph constructed in 0.31885600090026855 seconds
Wrote graph to binary file in 0.13667058944702148 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.916988
Louvain completed 21 runs in 0.7105915546417236 seconds
PhenoGraph complete in 1.4888038635253906 seconds
Found communities [-1, ... 21], with sizes: [75, 276, 264, 192, 148, 145, 143, 120, 117, 112, 95, 89, 84, 83, 79, 54, 51, 49, 44, 40, 31, 30, 27]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.308124303817749 seconds
Jaccard graph constructed in 0.3518354892730713 seconds
Wrote graph to binary file in 0.04296588897705078 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913021
After 2 runs, maximum modularity is Q = 0.914831
Louvain completed 22 runs in 0.8846080303192139 seconds
PhenoGraph complete in 1.6036791801452637 seconds
Found communities [-1, ... 19], with sizes: [96, 304, 238, 206, 160, 156, 151, 125, 110, 108, 94, 89, 89, 82, 71, 56, 53, 50, 39, 38, 33]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30726146697998047 seconds
Jaccard graph constructed in 0.3231089115142822 seconds
Wrote graph to binary file in 0.12204957008361816 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.915153
After 2 runs, maximum modularity is Q = 0.916807
Louvain completed 22 runs in 0.840766191482544 seconds
PhenoGraph complete in 1.6069214344024658 seconds
Found communities [-1, ... 19], with sizes: [92, 267, 256, 201, 179, 151, 147, 125, 116, 114, 112, 93, 86, 86, 67, 53, 52, 48, 35, 35, 33]

In [20]:
sc.pp.normalize_per_cell(D339_Biop_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D339_Biop_Nas1) # log transform the data
D339_Biop_Nas1.raw = D339_Biop_Nas1 # freeze the object (for later use of the raw state of it)
In [21]:
D339_Biop_Nas1 = D339_Biop_Nas1[:, D339_Biop_Nas1.var['ribo_genes']]
D339_Biop_Nas1
Out[21]:
View of AnnData object with n_obs × n_vars = 1879 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [22]:
D344_Biop_Nas1 = sc.read_10x_mtx(
    './D344_Biop_Nas1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D344_Biop_Nas1.var_names_make_unique()
D344_Biop_Nas1.obs['manip'] = 'D344_Biop_Nas1'
D344_Biop_Nas1.obs['position'] = 'Nasal'
D344_Biop_Nas1.obs['method'] = 'Biopsy'
D344_Biop_Nas1.obs['donor'] = 'D344'
D344_Biop_Nas1.obs['name'] = ['D344_Biop_Nas1_' + s for s in list(D344_Biop_Nas1.obs.index)]
D344_Biop_Nas1.obs_names = D344_Biop_Nas1.obs['name']
D344_Biop_Nas1
... reading from cache file ./cache/D344_Biop_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[22]:
AnnData object with n_obs × n_vars = 2121 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [23]:
sc.pl.highest_expr_genes(D344_Biop_Nas1, n_top=20)
In [24]:
sc.pp.filter_cells(D344_Biop_Nas1, min_genes=0)
mito_genes = D344_Biop_Nas1.var_names.str.startswith('MT-')
D344_Biop_Nas1.obs['percent_mito'] = np.sum(
    D344_Biop_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D344_Biop_Nas1.X, axis=1).A1
D344_Biop_Nas1.obs['n_counts'] = D344_Biop_Nas1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D344_Biop_Nas1.to_df())
ribo_genes = D344_Biop_Nas1.to_df().columns.isin(RB_genes_in_df)
D344_Biop_Nas1.obs['percent_ribo'] = np.sum(
    D344_Biop_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D344_Biop_Nas1.X, axis=1).A1
D344_Biop_Nas1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D344_Biop_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [25]:
sc.pp.filter_cells(D344_Biop_Nas1, min_genes=500)
D344_Biop_Nas1 = D344_Biop_Nas1[D344_Biop_Nas1.obs['n_counts'] < 50000, :]
D344_Biop_Nas1 = D344_Biop_Nas1[D344_Biop_Nas1.obs['percent_mito'] < 0.1, :]
filtered out 6 cells that have less than 500 genes expressed
In [26]:
# scrublet
scrub = scr.Scrublet(D344_Biop_Nas1.X, expected_doublet_rate=0.017)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D344_Biop_Nas1.obs['doublet_scores'] = doublet_scores
D344_Biop_Nas1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.18
Detected doublet rate = 0.7%
Estimated detectable doublet fraction = 28.9%
Overall doublet rate:
	Expected   = 1.7%
	Estimated  = 2.3%
Elapsed time: 1.5 seconds
Out[26]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecbbb5860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecc770668>],
       dtype=object))
In [27]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D344_Biop_Nas1.X).predict()
D344_Biop_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3668093681335449 seconds
Jaccard graph constructed in 0.37114810943603516 seconds
Wrote graph to binary file in 0.04429221153259277 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905021
Louvain completed 21 runs in 0.8386020660400391 seconds
PhenoGraph complete in 1.6339306831359863 seconds
Found communities [-1, ... 19], with sizes: [183, 344, 283, 240, 182, 176, 175, 132, 124, 117, 98, 86, 86, 72, 71, 60, 58, 55, 36, 23, 17]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30774354934692383 seconds
Jaccard graph constructed in 0.37660694122314453 seconds
Wrote graph to binary file in 0.0534367561340332 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902513
Louvain completed 21 runs in 0.8093357086181641 seconds
PhenoGraph complete in 1.5636885166168213 seconds
Found communities [-1, ... 18], with sizes: [171, 345, 324, 294, 248, 232, 129, 117, 112, 98, 76, 73, 72, 68, 67, 62, 57, 37, 19, 17]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3166005611419678 seconds
Jaccard graph constructed in 0.48200368881225586 seconds
Wrote graph to binary file in 0.04665350914001465 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903171
Louvain completed 21 runs in 0.8213727474212646 seconds
PhenoGraph complete in 1.683635950088501 seconds
Found communities [-1, ... 22], with sizes: [132, 389, 363, 171, 156, 151, 142, 120, 104, 102, 101, 87, 85, 75, 74, 67, 63, 60, 51, 42, 33, 18, 17, 15]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3100764751434326 seconds
Jaccard graph constructed in 0.35109758377075195 seconds
Wrote graph to binary file in 0.04471158981323242 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905114
Louvain completed 21 runs in 0.8178925514221191 seconds
PhenoGraph complete in 1.5368318557739258 seconds
Found communities [-1, ... 19], with sizes: [140, 374, 329, 249, 244, 217, 180, 118, 100, 97, 92, 86, 68, 66, 57, 54, 50, 40, 23, 18, 16]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3082611560821533 seconds
Jaccard graph constructed in 0.48171234130859375 seconds
Wrote graph to binary file in 0.043807029724121094 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904891
Louvain completed 21 runs in 0.8361577987670898 seconds
PhenoGraph complete in 1.6836771965026855 seconds
Found communities [-1, ... 19], with sizes: [159, 363, 300, 217, 189, 184, 172, 157, 126, 100, 97, 88, 83, 71, 66, 61, 60, 53, 37, 19, 16]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3137693405151367 seconds
Jaccard graph constructed in 0.37295007705688477 seconds
Wrote graph to binary file in 0.05366945266723633 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90592
Louvain completed 21 runs in 0.8222959041595459 seconds
PhenoGraph complete in 1.579627275466919 seconds
Found communities [-1, ... 20], with sizes: [183, 359, 225, 200, 186, 185, 143, 127, 116, 109, 102, 101, 91, 84, 76, 60, 57, 54, 53, 48, 47, 12]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30945873260498047 seconds
Jaccard graph constructed in 0.3735170364379883 seconds
Wrote graph to binary file in 0.1431713104248047 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901523
After 11 runs, maximum modularity is Q = 0.903058
Louvain completed 31 runs in 1.2631654739379883 seconds
PhenoGraph complete in 2.102349042892456 seconds
Found communities [-1, ... 19], with sizes: [172, 360, 250, 216, 203, 201, 160, 148, 137, 117, 110, 99, 68, 63, 60, 59, 58, 54, 39, 27, 17]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3099653720855713 seconds
Jaccard graph constructed in 0.36966371536254883 seconds
Wrote graph to binary file in 0.04363894462585449 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905927
Louvain completed 21 runs in 0.8191144466400146 seconds
PhenoGraph complete in 1.555978775024414 seconds
Found communities [-1, ... 21], with sizes: [151, 346, 265, 253, 177, 174, 149, 136, 130, 103, 101, 96, 86, 69, 66, 57, 57, 56, 56, 35, 27, 17, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30855607986450195 seconds
Jaccard graph constructed in 0.35804080963134766 seconds
Wrote graph to binary file in 0.14003729820251465 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906856
Louvain completed 21 runs in 0.8057575225830078 seconds
PhenoGraph complete in 1.623595952987671 seconds
Found communities [-1, ... 20], with sizes: [186, 356, 347, 243, 186, 154, 148, 112, 105, 104, 101, 90, 77, 72, 65, 62, 60, 49, 38, 25, 21, 17]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3120462894439697 seconds
Jaccard graph constructed in 0.37120652198791504 seconds
Wrote graph to binary file in 0.04476356506347656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90531
Louvain completed 21 runs in 0.7785615921020508 seconds
PhenoGraph complete in 1.5257573127746582 seconds
Found communities [-1, ... 22], with sizes: [179, 232, 212, 212, 203, 190, 186, 148, 110, 107, 107, 103, 100, 95, 68, 61, 60, 53, 50, 49, 35, 22, 20, 16]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30761241912841797 seconds
Jaccard graph constructed in 0.3466176986694336 seconds
Wrote graph to binary file in 0.14148497581481934 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899856
After 3 runs, maximum modularity is Q = 0.901909
Louvain completed 23 runs in 0.9796733856201172 seconds
PhenoGraph complete in 1.7867050170898438 seconds
Found communities [-1, ... 19], with sizes: [183, 344, 238, 219, 210, 169, 159, 149, 123, 112, 98, 94, 77, 77, 76, 59, 59, 58, 56, 42, 16]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3099222183227539 seconds
Jaccard graph constructed in 0.35921549797058105 seconds
Wrote graph to binary file in 0.044771432876586914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904101
Louvain completed 21 runs in 0.8093023300170898 seconds
PhenoGraph complete in 1.5360081195831299 seconds
Found communities [-1, ... 21], with sizes: [178, 352, 261, 213, 167, 166, 143, 134, 113, 109, 100, 90, 80, 78, 76, 72, 65, 56, 56, 46, 32, 17, 14]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31110095977783203 seconds
Jaccard graph constructed in 0.3798489570617676 seconds
Wrote graph to binary file in 0.14588046073913574 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903504
Louvain completed 21 runs in 0.8239104747772217 seconds
PhenoGraph complete in 1.6797006130218506 seconds
Found communities [-1, ... 20], with sizes: [200, 377, 210, 186, 184, 182, 171, 167, 145, 133, 113, 97, 86, 64, 64, 54, 52, 40, 39, 23, 16, 15]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3103766441345215 seconds
Jaccard graph constructed in 0.37268805503845215 seconds
Wrote graph to binary file in 0.04918384552001953 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902063
Louvain completed 21 runs in 0.8856055736541748 seconds
PhenoGraph complete in 1.6465637683868408 seconds
Found communities [-1, ... 19], with sizes: [157, 357, 280, 240, 199, 199, 195, 123, 105, 102, 96, 95, 79, 68, 62, 60, 59, 58, 43, 24, 17]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3075556755065918 seconds
Jaccard graph constructed in 0.41863036155700684 seconds
Wrote graph to binary file in 0.12459564208984375 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900123
After 3 runs, maximum modularity is Q = 0.902262
Louvain completed 23 runs in 0.9938900470733643 seconds
PhenoGraph complete in 1.8578953742980957 seconds
Found communities [-1, ... 18], with sizes: [160, 376, 341, 238, 214, 185, 169, 151, 110, 104, 103, 88, 74, 68, 62, 54, 50, 37, 17, 17]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3081660270690918 seconds
Jaccard graph constructed in 0.3730311393737793 seconds
Wrote graph to binary file in 0.05182766914367676 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903571
Louvain completed 21 runs in 0.8278617858886719 seconds
PhenoGraph complete in 1.5791656970977783 seconds
Found communities [-1, ... 20], with sizes: [166, 331, 236, 225, 205, 202, 193, 143, 106, 103, 98, 90, 84, 66, 65, 63, 61, 51, 50, 35, 23, 22]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3079235553741455 seconds
Jaccard graph constructed in 0.36020565032958984 seconds
Wrote graph to binary file in 0.12332344055175781 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903427
Louvain completed 21 runs in 0.8188588619232178 seconds
PhenoGraph complete in 1.6239402294158936 seconds
Found communities [-1, ... 22], with sizes: [153, 349, 197, 196, 196, 189, 168, 146, 124, 106, 101, 85, 83, 75, 75, 71, 66, 59, 50, 47, 38, 17, 14, 13]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3135530948638916 seconds
Jaccard graph constructed in 0.3795955181121826 seconds
Wrote graph to binary file in 0.04305911064147949 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901231
After 2 runs, maximum modularity is Q = 0.9023
Louvain completed 22 runs in 0.9632749557495117 seconds
PhenoGraph complete in 1.7123239040374756 seconds
Found communities [-1, ... 19], with sizes: [175, 389, 351, 226, 215, 177, 124, 109, 101, 99, 91, 87, 76, 74, 73, 64, 61, 58, 36, 17, 15]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30730199813842773 seconds
Jaccard graph constructed in 0.3530082702636719 seconds
Wrote graph to binary file in 0.1405472755432129 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903312
Louvain completed 21 runs in 0.8128602504730225 seconds
PhenoGraph complete in 1.6249172687530518 seconds
Found communities [-1, ... 20], with sizes: [145, 339, 309, 198, 192, 154, 150, 132, 114, 108, 104, 101, 91, 86, 76, 74, 72, 58, 45, 42, 16, 12]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3109710216522217 seconds
Jaccard graph constructed in 0.3767588138580322 seconds
Wrote graph to binary file in 0.04238247871398926 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902124
Louvain completed 21 runs in 0.8175806999206543 seconds
PhenoGraph complete in 1.560603380203247 seconds
Found communities [-1, ... 19], with sizes: [197, 342, 259, 199, 190, 176, 165, 129, 124, 118, 100, 97, 82, 82, 72, 64, 59, 52, 52, 42, 17]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30885910987854004 seconds
Jaccard graph constructed in 0.3689866065979004 seconds
Wrote graph to binary file in 0.14580225944519043 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899958
After 3 runs, maximum modularity is Q = 0.901022
Louvain completed 23 runs in 1.0120658874511719 seconds
PhenoGraph complete in 1.8503143787384033 seconds
Found communities [-1, ... 20], with sizes: [171, 356, 349, 192, 189, 170, 165, 124, 121, 102, 87, 78, 71, 70, 64, 63, 59, 57, 54, 39, 19, 18]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3081333637237549 seconds
Jaccard graph constructed in 0.37570738792419434 seconds
Wrote graph to binary file in 0.04296064376831055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901547
After 4 runs, maximum modularity is Q = 0.902717
Louvain completed 24 runs in 1.0371849536895752 seconds
PhenoGraph complete in 1.7782173156738281 seconds
Found communities [-1, ... 22], with sizes: [179, 327, 270, 183, 181, 178, 144, 128, 101, 95, 95, 93, 88, 82, 68, 67, 61, 58, 53, 53, 39, 33, 23, 19]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30927157402038574 seconds
Jaccard graph constructed in 0.37598204612731934 seconds
Wrote graph to binary file in 0.12826943397521973 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903794
After 2 runs, maximum modularity is Q = 0.905582
Louvain completed 22 runs in 0.9593186378479004 seconds
PhenoGraph complete in 1.786625862121582 seconds
Found communities [-1, ... 18], with sizes: [193, 376, 348, 241, 197, 166, 145, 132, 115, 102, 99, 84, 75, 73, 71, 58, 57, 42, 27, 17]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30988073348999023 seconds
Jaccard graph constructed in 0.3881664276123047 seconds
Wrote graph to binary file in 0.0462191104888916 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904599
Louvain completed 21 runs in 0.8111028671264648 seconds
PhenoGraph complete in 1.5830721855163574 seconds
Found communities [-1, ... 20], with sizes: [131, 335, 323, 264, 199, 170, 166, 133, 115, 110, 89, 89, 71, 70, 68, 60, 59, 56, 56, 19, 19, 16]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3081212043762207 seconds
Jaccard graph constructed in 0.36143040657043457 seconds
Wrote graph to binary file in 0.14709806442260742 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902482
Louvain completed 21 runs in 0.8033449649810791 seconds
PhenoGraph complete in 1.6391990184783936 seconds
Found communities [-1, ... 18], with sizes: [164, 359, 308, 215, 212, 190, 161, 156, 154, 100, 97, 93, 73, 66, 66, 65, 61, 43, 18, 17]

In [28]:
sc.pp.normalize_per_cell(D344_Biop_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D344_Biop_Nas1) # log transform the data
D344_Biop_Nas1.raw = D344_Biop_Nas1 # freeze the object (for later use of the raw state of it)
In [29]:
D344_Biop_Nas1 = D344_Biop_Nas1[:, D344_Biop_Nas1.var['ribo_genes']]
D344_Biop_Nas1
Out[29]:
View of AnnData object with n_obs × n_vars = 2095 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [30]:
D345_Biop_Nas1 = sc.read_10x_mtx(
    './D345_Biop_Nas1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D345_Biop_Nas1.var_names_make_unique()
D345_Biop_Nas1.obs['manip'] = 'D345_Biop_Nas1'
D345_Biop_Nas1.obs['position'] = 'Nasal'
D345_Biop_Nas1.obs['method'] = 'Biopsy'
D345_Biop_Nas1.obs['donor'] = 'D345'
D345_Biop_Nas1.obs['name'] = ['D345_Biop_Nas1_' + s for s in list(D345_Biop_Nas1.obs.index)]
D345_Biop_Nas1.obs_names = D345_Biop_Nas1.obs['name']
D345_Biop_Nas1
... reading from cache file ./cache/D345_Biop_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[30]:
AnnData object with n_obs × n_vars = 3259 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [31]:
sc.pl.highest_expr_genes(D345_Biop_Nas1, n_top=20)
In [32]:
sc.pp.filter_cells(D345_Biop_Nas1, min_genes=0)
mito_genes = D345_Biop_Nas1.var_names.str.startswith('MT-')
D345_Biop_Nas1.obs['percent_mito'] = np.sum(
    D345_Biop_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D345_Biop_Nas1.X, axis=1).A1
D345_Biop_Nas1.obs['n_counts'] = D345_Biop_Nas1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D345_Biop_Nas1.to_df())
ribo_genes = D345_Biop_Nas1.to_df().columns.isin(RB_genes_in_df)
D345_Biop_Nas1.obs['percent_ribo'] = np.sum(
    D345_Biop_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D345_Biop_Nas1.X, axis=1).A1
D345_Biop_Nas1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D345_Biop_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [33]:
sc.pp.filter_cells(D345_Biop_Nas1, min_genes=500)
D345_Biop_Nas1 = D345_Biop_Nas1[D345_Biop_Nas1.obs['n_counts'] < 20000, :]
D345_Biop_Nas1 = D345_Biop_Nas1[D345_Biop_Nas1.obs['percent_mito'] < 0.2, :]
filtered out 32 cells that have less than 500 genes expressed
In [34]:
# scrublet
scrub = scr.Scrublet(D345_Biop_Nas1.X, expected_doublet_rate=0.025)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D345_Biop_Nas1.obs['doublet_scores'] = doublet_scores
D345_Biop_Nas1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.20
Detected doublet rate = 1.0%
Estimated detectable doublet fraction = 33.4%
Overall doublet rate:
	Expected   = 2.5%
	Estimated  = 3.0%
Elapsed time: 2.1 seconds
Out[34]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb8d51208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecc4fa518>],
       dtype=object))
In [35]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D345_Biop_Nas1.X).predict()
D345_Biop_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.509113073348999 seconds
Jaccard graph constructed in 0.47003698348999023 seconds
Wrote graph to binary file in 0.16583967208862305 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.875367
Louvain completed 21 runs in 1.0838100910186768 seconds
PhenoGraph complete in 2.2446818351745605 seconds
Found communities [-1, ... 20], with sizes: [55, 513, 452, 370, 293, 292, 270, 249, 240, 213, 196, 129, 123, 122, 110, 95, 64, 64, 58, 57, 53, 13]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.509962797164917 seconds
Jaccard graph constructed in 0.4884061813354492 seconds
Wrote graph to binary file in 0.07520413398742676 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876782
After 8 runs, maximum modularity is Q = 0.877922
Louvain completed 28 runs in 1.4965903759002686 seconds
PhenoGraph complete in 2.58528733253479 seconds
Found communities [-1, ... 19], with sizes: [60, 682, 475, 397, 327, 290, 257, 251, 237, 198, 137, 136, 105, 82, 73, 72, 66, 61, 56, 55, 14]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5157821178436279 seconds
Jaccard graph constructed in 0.6081719398498535 seconds
Wrote graph to binary file in 0.07874584197998047 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.877342
After 3 runs, maximum modularity is Q = 0.879469
Louvain completed 23 runs in 1.3534622192382812 seconds
PhenoGraph complete in 2.5744004249572754 seconds
Found communities [-1, ... 20], with sizes: [54, 499, 459, 418, 318, 288, 266, 261, 243, 232, 214, 135, 116, 110, 100, 73, 72, 60, 52, 37, 13, 11]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5116434097290039 seconds
Jaccard graph constructed in 0.48578476905822754 seconds
Wrote graph to binary file in 0.2102823257446289 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.878289
After 2 runs, maximum modularity is Q = 0.879619
After 6 runs, maximum modularity is Q = 0.881189
Louvain completed 26 runs in 1.5322465896606445 seconds
PhenoGraph complete in 2.75622820854187 seconds
Found communities [-1, ... 20], with sizes: [76, 444, 394, 375, 344, 277, 276, 261, 247, 233, 194, 177, 136, 102, 102, 92, 91, 71, 61, 52, 13, 13]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.509721040725708 seconds
Jaccard graph constructed in 0.4993550777435303 seconds
Wrote graph to binary file in 0.16241002082824707 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.875284
After 8 runs, maximum modularity is Q = 0.876389
After 14 runs, maximum modularity is Q = 0.877973
Louvain completed 34 runs in 1.9067745208740234 seconds
PhenoGraph complete in 3.0941972732543945 seconds
Found communities [-1, ... 20], with sizes: [57, 518, 468, 446, 380, 342, 325, 248, 243, 206, 122, 119, 102, 94, 66, 64, 60, 59, 56, 25, 20, 11]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5105977058410645 seconds
Jaccard graph constructed in 0.48778533935546875 seconds
Wrote graph to binary file in 0.16517400741577148 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.874687
After 21 runs, maximum modularity is Q = 0.875736
Louvain completed 41 runs in 2.0271809101104736 seconds
PhenoGraph complete in 3.2078487873077393 seconds
Found communities [-1, ... 19], with sizes: [61, 515, 482, 407, 368, 330, 318, 264, 241, 210, 190, 135, 133, 101, 75, 58, 54, 41, 23, 14, 11]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5117900371551514 seconds
Jaccard graph constructed in 0.5218122005462646 seconds
Wrote graph to binary file in 0.1895432472229004 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872572
After 2 runs, maximum modularity is Q = 0.874746
After 9 runs, maximum modularity is Q = 0.876278
Louvain completed 29 runs in 1.7165610790252686 seconds
PhenoGraph complete in 2.9558792114257812 seconds
Found communities [-1, ... 20], with sizes: [45, 646, 482, 314, 308, 282, 265, 254, 240, 196, 185, 136, 115, 114, 91, 81, 70, 67, 63, 51, 13, 13]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5092618465423584 seconds
Jaccard graph constructed in 0.5011942386627197 seconds
Wrote graph to binary file in 0.07474088668823242 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.877227
After 4 runs, maximum modularity is Q = 0.878352
Louvain completed 24 runs in 1.3611412048339844 seconds
PhenoGraph complete in 2.462210178375244 seconds
Found communities [-1, ... 19], with sizes: [58, 479, 421, 416, 335, 325, 287, 274, 259, 186, 179, 143, 141, 137, 92, 64, 61, 57, 52, 51, 14]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5097126960754395 seconds
Jaccard graph constructed in 0.473783016204834 seconds
Wrote graph to binary file in 0.18263554573059082 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876872
After 2 runs, maximum modularity is Q = 0.878572
Louvain completed 22 runs in 1.2276060581207275 seconds
PhenoGraph complete in 2.415370225906372 seconds
Found communities [-1, ... 19], with sizes: [52, 529, 466, 438, 381, 340, 273, 265, 198, 177, 134, 131, 129, 116, 110, 85, 69, 59, 53, 14, 12]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5107555389404297 seconds
Jaccard graph constructed in 0.49406886100769043 seconds
Wrote graph to binary file in 0.17885661125183105 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882089
Louvain completed 21 runs in 1.0463809967041016 seconds
PhenoGraph complete in 2.2454686164855957 seconds
Found communities [-1, ... 19], with sizes: [52, 492, 457, 434, 311, 310, 305, 302, 266, 194, 182, 140, 117, 105, 82, 77, 65, 60, 54, 14, 12]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5092160701751709 seconds
Jaccard graph constructed in 0.49118804931640625 seconds
Wrote graph to binary file in 0.16657304763793945 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.875191
After 2 runs, maximum modularity is Q = 0.876792
Louvain completed 22 runs in 1.2382164001464844 seconds
PhenoGraph complete in 2.4236416816711426 seconds
Found communities [-1, ... 19], with sizes: [41, 604, 448, 368, 343, 342, 325, 291, 256, 230, 192, 142, 99, 73, 69, 64, 54, 37, 27, 15, 11]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5095369815826416 seconds
Jaccard graph constructed in 0.509570837020874 seconds
Wrote graph to binary file in 0.19415497779846191 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.877646
After 11 runs, maximum modularity is Q = 0.879248
Louvain completed 31 runs in 1.5875606536865234 seconds
PhenoGraph complete in 2.8171303272247314 seconds
Found communities [-1, ... 18], with sizes: [52, 485, 438, 436, 340, 329, 313, 310, 244, 189, 164, 162, 136, 102, 72, 67, 64, 61, 52, 15]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5167100429534912 seconds
Jaccard graph constructed in 0.5146276950836182 seconds
Wrote graph to binary file in 0.07436084747314453 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.877752
Louvain completed 21 runs in 1.036419153213501 seconds
PhenoGraph complete in 2.167802095413208 seconds
Found communities [-1, ... 20], with sizes: [61, 485, 338, 315, 300, 299, 291, 281, 272, 246, 226, 162, 152, 143, 108, 94, 66, 59, 55, 51, 15, 12]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5120022296905518 seconds
Jaccard graph constructed in 0.4786539077758789 seconds
Wrote graph to binary file in 0.18381333351135254 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876809
Louvain completed 21 runs in 1.106832504272461 seconds
PhenoGraph complete in 2.2989614009857178 seconds
Found communities [-1, ... 18], with sizes: [53, 581, 560, 413, 306, 289, 289, 282, 243, 194, 158, 130, 115, 112, 105, 67, 54, 54, 14, 12]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5097548961639404 seconds
Jaccard graph constructed in 0.4959716796875 seconds
Wrote graph to binary file in 0.16306304931640625 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876634
After 2 runs, maximum modularity is Q = 0.878518
Louvain completed 22 runs in 1.2712428569793701 seconds
PhenoGraph complete in 2.455409526824951 seconds
Found communities [-1, ... 18], with sizes: [63, 643, 543, 482, 335, 310, 272, 258, 209, 134, 128, 119, 115, 96, 90, 77, 67, 60, 17, 13]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5097231864929199 seconds
Jaccard graph constructed in 0.47243189811706543 seconds
Wrote graph to binary file in 0.18134784698486328 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.877117
After 4 runs, maximum modularity is Q = 0.878692
Louvain completed 24 runs in 1.3584632873535156 seconds
PhenoGraph complete in 2.536989450454712 seconds
Found communities [-1, ... 19], with sizes: [70, 508, 489, 384, 335, 328, 328, 303, 261, 222, 174, 157, 117, 95, 59, 57, 55, 37, 24, 15, 13]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.509711742401123 seconds
Jaccard graph constructed in 0.4906728267669678 seconds
Wrote graph to binary file in 0.18431615829467773 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.874887
After 2 runs, maximum modularity is Q = 0.877135
After 14 runs, maximum modularity is Q = 0.878304
Louvain completed 34 runs in 1.9109795093536377 seconds
PhenoGraph complete in 3.111431121826172 seconds
Found communities [-1, ... 21], with sizes: [73, 471, 432, 415, 398, 330, 258, 253, 241, 233, 143, 132, 119, 117, 79, 68, 57, 56, 54, 53, 21, 15, 13]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5172734260559082 seconds
Jaccard graph constructed in 0.47579026222229004 seconds
Wrote graph to binary file in 0.07378578186035156 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.879685
Louvain completed 21 runs in 1.0910844802856445 seconds
PhenoGraph complete in 2.1752805709838867 seconds
Found communities [-1, ... 20], with sizes: [54, 477, 477, 437, 380, 323, 284, 277, 233, 142, 138, 132, 129, 107, 98, 86, 63, 60, 57, 52, 13, 12]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5105595588684082 seconds
Jaccard graph constructed in 0.48531103134155273 seconds
Wrote graph to binary file in 0.18321013450622559 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.874996
After 2 runs, maximum modularity is Q = 0.876336
After 3 runs, maximum modularity is Q = 0.877998
Louvain completed 23 runs in 1.5135843753814697 seconds
PhenoGraph complete in 2.7096638679504395 seconds
Found communities [-1, ... 19], with sizes: [59, 467, 458, 422, 324, 307, 296, 276, 272, 194, 188, 178, 138, 102, 77, 72, 60, 58, 55, 17, 11]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5126919746398926 seconds
Jaccard graph constructed in 0.4905238151550293 seconds
Wrote graph to binary file in 0.17134785652160645 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.878769
Louvain completed 21 runs in 1.047325849533081 seconds
PhenoGraph complete in 2.2437732219696045 seconds
Found communities [-1, ... 17], with sizes: [78, 473, 441, 418, 318, 309, 305, 279, 258, 243, 188, 156, 138, 97, 91, 90, 76, 58, 15]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5183579921722412 seconds
Jaccard graph constructed in 0.4668898582458496 seconds
Wrote graph to binary file in 0.16524100303649902 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.878542
After 2 runs, maximum modularity is Q = 0.879898
After 4 runs, maximum modularity is Q = 0.881055
After 5 runs, maximum modularity is Q = 0.88249
Louvain completed 25 runs in 1.5979628562927246 seconds
PhenoGraph complete in 2.766202688217163 seconds
Found communities [-1, ... 19], with sizes: [71, 501, 467, 314, 314, 285, 283, 262, 261, 255, 208, 207, 144, 96, 77, 76, 67, 67, 50, 13, 13]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5119054317474365 seconds
Jaccard graph constructed in 0.49642109870910645 seconds
Wrote graph to binary file in 0.19015765190124512 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.877089
After 8 runs, maximum modularity is Q = 0.878562
Louvain completed 28 runs in 1.5239088535308838 seconds
PhenoGraph complete in 2.7475368976593018 seconds
Found communities [-1, ... 18], with sizes: [57, 682, 475, 356, 333, 271, 267, 233, 226, 207, 193, 166, 129, 121, 79, 67, 59, 55, 41, 14]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5094122886657715 seconds
Jaccard graph constructed in 0.49228978157043457 seconds
Wrote graph to binary file in 0.07393264770507812 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87725
After 3 runs, maximum modularity is Q = 0.878287
Louvain completed 23 runs in 1.330054521560669 seconds
PhenoGraph complete in 2.421684980392456 seconds
Found communities [-1, ... 19], with sizes: [70, 511, 425, 404, 349, 325, 306, 295, 271, 247, 189, 139, 100, 71, 70, 69, 62, 56, 47, 14, 11]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5103309154510498 seconds
Jaccard graph constructed in 0.4899752140045166 seconds
Wrote graph to binary file in 0.1648874282836914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.877486
After 9 runs, maximum modularity is Q = 0.878707
Louvain completed 29 runs in 1.5609664916992188 seconds
PhenoGraph complete in 2.74412202835083 seconds
Found communities [-1, ... 19], with sizes: [68, 458, 440, 371, 369, 349, 278, 267, 246, 240, 193, 130, 119, 114, 75, 73, 70, 60, 54, 44, 13]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5118117332458496 seconds
Jaccard graph constructed in 0.48421311378479004 seconds
Wrote graph to binary file in 0.18859124183654785 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.878439
After 3 runs, maximum modularity is Q = 0.880161
Louvain completed 23 runs in 1.2550854682922363 seconds
PhenoGraph complete in 2.455286979675293 seconds
Found communities [-1, ... 20], with sizes: [55, 486, 470, 358, 323, 296, 284, 253, 243, 211, 171, 146, 132, 131, 101, 75, 74, 69, 66, 61, 13, 13]

In [36]:
sc.pp.normalize_per_cell(D345_Biop_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D345_Biop_Nas1) # log transform the data
D345_Biop_Nas1.raw = D345_Biop_Nas1 # freeze the object (for later use of the raw state of it)
In [37]:
D345_Biop_Nas1 = D345_Biop_Nas1[:, D345_Biop_Nas1.var['ribo_genes']]
D345_Biop_Nas1
Out[37]:
View of AnnData object with n_obs × n_vars = 3225 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

Nasal Brushings

Back to top

In [38]:
D353_Brus_Nas1 = sc.read_10x_mtx(
    './D353_Brus_Nas1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D353_Brus_Nas1.var_names_make_unique()
D353_Brus_Nas1.obs['manip'] = 'D353_Brus_Nas1'
D353_Brus_Nas1.obs['position'] = 'Nasal'
D353_Brus_Nas1.obs['method'] = 'Brushing'
D353_Brus_Nas1.obs['donor'] = 'D353'
D353_Brus_Nas1.obs['name'] = ['D353_Brus_Nas1_' + s for s in list(D353_Brus_Nas1.obs.index)]
D353_Brus_Nas1.obs_names = D353_Brus_Nas1.obs['name']
D353_Brus_Nas1
... reading from cache file ./cache/D353_Brus_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[38]:
AnnData object with n_obs × n_vars = 5154 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [39]:
sc.pl.highest_expr_genes(D353_Brus_Nas1, n_top=20)
In [40]:
sc.pp.filter_cells(D353_Brus_Nas1, min_genes=0)
mito_genes = D353_Brus_Nas1.var_names.str.startswith('MT-')
D353_Brus_Nas1.obs['percent_mito'] = np.sum(
    D353_Brus_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D353_Brus_Nas1.X, axis=1).A1
D353_Brus_Nas1.obs['n_counts'] = D353_Brus_Nas1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Brus_Nas1.to_df())
ribo_genes = D353_Brus_Nas1.to_df().columns.isin(RB_genes_in_df)
D353_Brus_Nas1.obs['percent_ribo'] = np.sum(
    D353_Brus_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D353_Brus_Nas1.X, axis=1).A1
D353_Brus_Nas1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D353_Brus_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [41]:
sc.pp.filter_cells(D353_Brus_Nas1, min_genes=500)
D353_Brus_Nas1 = D353_Brus_Nas1[D353_Brus_Nas1.obs['n_counts'] < 40000, :]
D353_Brus_Nas1 = D353_Brus_Nas1[D353_Brus_Nas1.obs['percent_mito'] < 0.5, :]
filtered out 3 cells that have less than 500 genes expressed
In [42]:
# scrublet
scrub = scr.Scrublet(D353_Brus_Nas1.X, expected_doublet_rate=0.04)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D353_Brus_Nas1.obs['doublet_scores'] = doublet_scores
D353_Brus_Nas1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.47
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 10.8%
Overall doublet rate:
	Expected   = 4.0%
	Estimated  = 0.9%
Elapsed time: 5.6 seconds
Out[42]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecb02bc50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecafefd30>],
       dtype=object))
In [43]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D353_Brus_Nas1.X).predict()
D353_Brus_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2127726078033447 seconds
Jaccard graph constructed in 0.7197434902191162 seconds
Wrote graph to binary file in 0.25298023223876953 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909195
After 2 runs, maximum modularity is Q = 0.911025
Louvain completed 22 runs in 1.7851529121398926 seconds
PhenoGraph complete in 3.994328022003174 seconds
Found communities [-1, ... 26], with sizes: [42, 780, 676, 492, 439, 415, 408, 329, 310, 298, 291, 286, 216, 207, 155, 153, 130, 120, 115, 100, 98, 70, 69, 65, 51, 41, 36, 21]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.7255289554595947 seconds
Jaccard graph constructed in 0.927344560623169 seconds
Wrote graph to binary file in 0.24228453636169434 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910609
Louvain completed 21 runs in 1.581740379333496 seconds
PhenoGraph complete in 4.498773574829102 seconds
Found communities [-1, ... 25], with sizes: [51, 999, 539, 471, 452, 436, 387, 315, 309, 298, 269, 267, 243, 215, 174, 148, 148, 144, 102, 87, 79, 69, 64, 53, 36, 29, 29]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2243099212646484 seconds
Jaccard graph constructed in 0.6852452754974365 seconds
Wrote graph to binary file in 0.24946808815002441 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910357
Louvain completed 21 runs in 1.5184452533721924 seconds
PhenoGraph complete in 3.697181463241577 seconds
Found communities [-1, ... 28], with sizes: [61, 606, 479, 471, 436, 429, 417, 367, 332, 295, 292, 283, 215, 213, 208, 205, 186, 156, 145, 91, 90, 73, 69, 61, 60, 52, 39, 36, 28, 18]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2142560482025146 seconds
Jaccard graph constructed in 0.6664714813232422 seconds
Wrote graph to binary file in 0.23502373695373535 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909123
After 2 runs, maximum modularity is Q = 0.911097
Louvain completed 22 runs in 1.88018798828125 seconds
PhenoGraph complete in 4.01696252822876 seconds
Found communities [-1, ... 28], with sizes: [49, 710, 629, 550, 488, 403, 388, 308, 306, 295, 285, 263, 261, 233, 182, 155, 141, 136, 90, 73, 70, 68, 65, 55, 52, 44, 36, 34, 29, 15]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2145659923553467 seconds
Jaccard graph constructed in 0.8374745845794678 seconds
Wrote graph to binary file in 0.21868538856506348 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910927
Louvain completed 21 runs in 1.5645432472229004 seconds
PhenoGraph complete in 3.8568105697631836 seconds
Found communities [-1, ... 25], with sizes: [38, 925, 796, 689, 562, 430, 322, 271, 249, 231, 216, 201, 191, 172, 169, 145, 130, 110, 90, 90, 87, 70, 58, 57, 42, 36, 36]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.214392900466919 seconds
Jaccard graph constructed in 0.6968972682952881 seconds
Wrote graph to binary file in 0.24641752243041992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909486
Louvain completed 21 runs in 1.521557331085205 seconds
PhenoGraph complete in 3.7022786140441895 seconds
Found communities [-1, ... 26], with sizes: [37, 1023, 743, 450, 431, 400, 342, 304, 284, 238, 236, 223, 213, 206, 188, 180, 149, 144, 114, 90, 86, 70, 67, 56, 42, 37, 37, 23]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2143833637237549 seconds
Jaccard graph constructed in 0.737191915512085 seconds
Wrote graph to binary file in 0.2756507396697998 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911245
Louvain completed 21 runs in 1.66801118850708 seconds
PhenoGraph complete in 3.927262783050537 seconds
Found communities [-1, ... 27], with sizes: [45, 1077, 539, 453, 436, 410, 393, 328, 307, 307, 268, 256, 218, 211, 169, 147, 130, 109, 99, 91, 70, 64, 57, 53, 50, 39, 36, 28, 23]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2155818939208984 seconds
Jaccard graph constructed in 0.836904764175415 seconds
Wrote graph to binary file in 0.22152042388916016 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909791
After 12 runs, maximum modularity is Q = 0.910807
Louvain completed 32 runs in 2.4536595344543457 seconds
PhenoGraph complete in 4.7473509311676025 seconds
Found communities [-1, ... 29], with sizes: [52, 876, 626, 487, 417, 408, 391, 300, 275, 264, 220, 219, 219, 194, 184, 173, 156, 151, 142, 92, 88, 76, 70, 65, 57, 51, 48, 36, 26, 25, 25]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2221226692199707 seconds
Jaccard graph constructed in 0.740626335144043 seconds
Wrote graph to binary file in 0.3226799964904785 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911461
Louvain completed 21 runs in 1.550100564956665 seconds
PhenoGraph complete in 3.8605575561523438 seconds
Found communities [-1, ... 27], with sizes: [43, 970, 605, 522, 467, 410, 316, 306, 301, 285, 281, 244, 244, 185, 155, 143, 129, 127, 100, 91, 76, 71, 70, 68, 54, 51, 37, 36, 26]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2145960330963135 seconds
Jaccard graph constructed in 0.7350647449493408 seconds
Wrote graph to binary file in 0.262676477432251 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906941
After 5 runs, maximum modularity is Q = 0.908376
Louvain completed 25 runs in 2.062347650527954 seconds
PhenoGraph complete in 4.295223236083984 seconds
Found communities [-1, ... 24], with sizes: [54, 1001, 686, 550, 549, 391, 337, 334, 294, 284, 274, 238, 222, 203, 155, 141, 135, 101, 94, 91, 68, 67, 52, 40, 39, 13]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2217864990234375 seconds
Jaccard graph constructed in 0.7268059253692627 seconds
Wrote graph to binary file in 0.24543118476867676 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907168
After 3 runs, maximum modularity is Q = 0.908238
Louvain completed 23 runs in 1.8657281398773193 seconds
PhenoGraph complete in 4.082415819168091 seconds
Found communities [-1, ... 26], with sizes: [42, 761, 632, 631, 467, 418, 362, 356, 271, 262, 262, 259, 231, 222, 204, 143, 139, 119, 101, 86, 85, 74, 71, 67, 53, 39, 39, 17]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2191917896270752 seconds
Jaccard graph constructed in 0.8307540416717529 seconds
Wrote graph to binary file in 0.21717500686645508 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908685
After 5 runs, maximum modularity is Q = 0.909771
Louvain completed 25 runs in 1.980724573135376 seconds
PhenoGraph complete in 4.268359899520874 seconds
Found communities [-1, ... 28], with sizes: [62, 830, 716, 523, 424, 400, 312, 287, 265, 258, 252, 244, 234, 221, 212, 151, 139, 130, 118, 99, 90, 72, 69, 65, 64, 53, 42, 36, 25, 20]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.21579909324646 seconds
Jaccard graph constructed in 0.7258813381195068 seconds
Wrote graph to binary file in 0.24634504318237305 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909735
Louvain completed 21 runs in 1.53537917137146 seconds
PhenoGraph complete in 3.7452125549316406 seconds
Found communities [-1, ... 28], with sizes: [48, 648, 632, 494, 433, 411, 397, 317, 307, 292, 286, 248, 227, 222, 216, 202, 152, 137, 128, 90, 87, 70, 69, 62, 55, 52, 50, 36, 23, 22]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2153537273406982 seconds
Jaccard graph constructed in 0.7558958530426025 seconds
Wrote graph to binary file in 0.24044060707092285 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908969
Louvain completed 21 runs in 1.5639097690582275 seconds
PhenoGraph complete in 3.8045454025268555 seconds
Found communities [-1, ... 27], with sizes: [48, 899, 628, 546, 484, 417, 306, 293, 292, 270, 243, 240, 224, 215, 188, 187, 154, 116, 93, 91, 89, 75, 70, 64, 53, 44, 36, 34, 14]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.214221477508545 seconds
Jaccard graph constructed in 0.8624017238616943 seconds
Wrote graph to binary file in 0.21771502494812012 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910122
Louvain completed 21 runs in 1.6675090789794922 seconds
PhenoGraph complete in 3.9836511611938477 seconds
Found communities [-1, ... 25], with sizes: [61, 964, 700, 549, 469, 469, 381, 368, 275, 251, 230, 214, 206, 200, 157, 153, 128, 104, 94, 85, 70, 67, 57, 54, 43, 37, 27]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2226972579956055 seconds
Jaccard graph constructed in 0.7140727043151855 seconds
Wrote graph to binary file in 0.24596619606018066 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909802
Louvain completed 21 runs in 1.6083636283874512 seconds
PhenoGraph complete in 3.81315541267395 seconds
Found communities [-1, ... 26], with sizes: [35, 974, 779, 712, 516, 402, 332, 316, 295, 275, 217, 196, 191, 152, 138, 137, 112, 101, 86, 77, 72, 64, 62, 54, 42, 36, 29, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2138848304748535 seconds
Jaccard graph constructed in 0.7432422637939453 seconds
Wrote graph to binary file in 0.24086976051330566 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910733
Louvain completed 21 runs in 1.567589282989502 seconds
PhenoGraph complete in 3.7843167781829834 seconds
Found communities [-1, ... 26], with sizes: [49, 931, 757, 551, 466, 390, 358, 302, 279, 236, 233, 224, 209, 199, 196, 161, 152, 116, 94, 89, 72, 71, 62, 62, 55, 38, 36, 25]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2250697612762451 seconds
Jaccard graph constructed in 0.9474897384643555 seconds
Wrote graph to binary file in 0.23464679718017578 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907892
After 5 runs, maximum modularity is Q = 0.909824
Louvain completed 25 runs in 2.123487949371338 seconds
PhenoGraph complete in 4.55275559425354 seconds
Found communities [-1, ... 26], with sizes: [36, 967, 556, 491, 456, 431, 405, 337, 308, 304, 296, 260, 235, 166, 159, 136, 120, 118, 114, 92, 74, 69, 65, 56, 48, 46, 37, 31]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2245111465454102 seconds
Jaccard graph constructed in 0.7266659736633301 seconds
Wrote graph to binary file in 0.2795066833496094 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909389
Louvain completed 21 runs in 1.5866949558258057 seconds
PhenoGraph complete in 3.8383898735046387 seconds
Found communities [-1, ... 27], with sizes: [55, 774, 595, 513, 488, 419, 405, 308, 307, 285, 240, 236, 235, 233, 168, 156, 148, 124, 117, 102, 90, 70, 69, 60, 58, 52, 44, 36, 26]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3145020008087158 seconds
Jaccard graph constructed in 0.7366433143615723 seconds
Wrote graph to binary file in 0.2247462272644043 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912161
After 2 runs, maximum modularity is Q = 0.913311
Louvain completed 22 runs in 1.7416725158691406 seconds
PhenoGraph complete in 4.041547060012817 seconds
Found communities [-1, ... 28], with sizes: [46, 976, 546, 492, 453, 411, 360, 325, 287, 274, 231, 229, 217, 206, 175, 153, 149, 145, 117, 91, 75, 69, 69, 64, 63, 53, 37, 37, 36, 27]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2202198505401611 seconds
Jaccard graph constructed in 0.7133526802062988 seconds
Wrote graph to binary file in 0.24982571601867676 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909881
Louvain completed 21 runs in 1.5742614269256592 seconds
PhenoGraph complete in 3.7793445587158203 seconds
Found communities [-1, ... 28], with sizes: [51, 636, 573, 555, 441, 380, 326, 314, 305, 296, 276, 219, 216, 207, 200, 197, 157, 150, 140, 116, 96, 91, 88, 85, 73, 60, 54, 44, 36, 31]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2174968719482422 seconds
Jaccard graph constructed in 0.8321328163146973 seconds
Wrote graph to binary file in 0.21767473220825195 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911118
After 4 runs, maximum modularity is Q = 0.912837
Louvain completed 24 runs in 1.9395034313201904 seconds
PhenoGraph complete in 4.227894306182861 seconds
Found communities [-1, ... 26], with sizes: [39, 1016, 498, 494, 468, 461, 387, 331, 297, 293, 284, 242, 215, 196, 152, 138, 128, 125, 98, 91, 83, 76, 75, 69, 56, 41, 37, 23]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2213521003723145 seconds
Jaccard graph constructed in 0.7391245365142822 seconds
Wrote graph to binary file in 0.24411273002624512 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909974
After 7 runs, maximum modularity is Q = 0.911852
Louvain completed 27 runs in 2.086089611053467 seconds
PhenoGraph complete in 4.313715934753418 seconds
Found communities [-1, ... 27], with sizes: [39, 853, 520, 455, 453, 444, 416, 370, 318, 291, 287, 268, 243, 210, 172, 156, 151, 129, 102, 91, 82, 69, 62, 54, 51, 36, 34, 29, 28]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2173089981079102 seconds
Jaccard graph constructed in 0.7325339317321777 seconds
Wrote graph to binary file in 0.24312448501586914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908168
After 2 runs, maximum modularity is Q = 0.909446
Louvain completed 22 runs in 1.746293306350708 seconds
PhenoGraph complete in 3.960584878921509 seconds
Found communities [-1, ... 27], with sizes: [60, 833, 529, 528, 524, 435, 422, 306, 297, 224, 223, 218, 217, 212, 161, 151, 146, 138, 112, 102, 96, 91, 91, 64, 54, 53, 45, 45, 36]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2166056632995605 seconds
Jaccard graph constructed in 0.7211313247680664 seconds
Wrote graph to binary file in 0.2405107021331787 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910063
After 11 runs, maximum modularity is Q = 0.911116
Louvain completed 31 runs in 2.39713978767395 seconds
PhenoGraph complete in 4.596807241439819 seconds
Found communities [-1, ... 25], with sizes: [49, 804, 790, 456, 403, 395, 375, 343, 320, 312, 241, 224, 215, 212, 206, 162, 156, 134, 115, 91, 90, 74, 59, 55, 52, 44, 36]

In [44]:
sc.pp.normalize_per_cell(D353_Brus_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D353_Brus_Nas1) # log transform the data
D353_Brus_Nas1.raw = D353_Brus_Nas1 # freeze the object (for later use of the raw state of it)
In [45]:
D353_Brus_Nas1 = D353_Brus_Nas1[:, D353_Brus_Nas1.var['ribo_genes']]
D353_Brus_Nas1
Out[45]:
View of AnnData object with n_obs × n_vars = 5131 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [46]:
D363_Brus_Nas1 = sc.read_10x_mtx(
    './D363_Brus_Nas1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D363_Brus_Nas1.var_names_make_unique()
D363_Brus_Nas1.obs['manip'] = 'D363_Brus_Nas1'
D363_Brus_Nas1.obs['position'] = 'Nasal'
D363_Brus_Nas1.obs['method'] = 'Brushing'
D363_Brus_Nas1.obs['donor'] = 'D363'
D363_Brus_Nas1.obs['name'] = ['D363_Brus_Nas1_' + s for s in list(D363_Brus_Nas1.obs.index)]
D363_Brus_Nas1.obs_names = D363_Brus_Nas1.obs['name']
D363_Brus_Nas1
... reading from cache file ./cache/D363_Brus_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[46]:
AnnData object with n_obs × n_vars = 3505 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [47]:
sc.pl.highest_expr_genes(D363_Brus_Nas1, n_top=20)
In [48]:
sc.pp.filter_cells(D363_Brus_Nas1, min_genes=0)
mito_genes = D363_Brus_Nas1.var_names.str.startswith('MT-')
D363_Brus_Nas1.obs['percent_mito'] = np.sum(
    D363_Brus_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D363_Brus_Nas1.X, axis=1).A1
D363_Brus_Nas1.obs['n_counts'] = D363_Brus_Nas1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D363_Brus_Nas1.to_df())
ribo_genes = D363_Brus_Nas1.to_df().columns.isin(RB_genes_in_df)
D363_Brus_Nas1.obs['percent_ribo'] = np.sum(
    D363_Brus_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D363_Brus_Nas1.X, axis=1).A1
D363_Brus_Nas1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D363_Brus_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [49]:
sc.pp.filter_cells(D363_Brus_Nas1, min_genes=500)
D363_Brus_Nas1 = D363_Brus_Nas1[D363_Brus_Nas1.obs['n_counts'] < 30000, :]
D363_Brus_Nas1 = D363_Brus_Nas1[D363_Brus_Nas1.obs['percent_mito'] < 0.5, :]
filtered out 1 cells that have less than 500 genes expressed
In [50]:
# scrublet
scrub = scr.Scrublet(D363_Brus_Nas1.X, expected_doublet_rate=0.027)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D363_Brus_Nas1.obs['doublet_scores'] = doublet_scores
D363_Brus_Nas1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.34
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 6.1%
Overall doublet rate:
	Expected   = 2.7%
	Estimated  = 2.8%
Elapsed time: 3.1 seconds
Out[50]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb88855f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecbfc75c0>],
       dtype=object))
In [51]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D363_Brus_Nas1.X).predict()
D363_Brus_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.011885166168213 seconds
Jaccard graph constructed in 0.5874404907226562 seconds
Wrote graph to binary file in 0.21593761444091797 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895554
After 2 runs, maximum modularity is Q = 0.896566
Louvain completed 22 runs in 1.470625638961792 seconds
PhenoGraph complete in 3.3034350872039795 seconds
Found communities [-1, ... 20], with sizes: [93, 554, 551, 513, 398, 285, 271, 256, 205, 167, 166, 141, 136, 106, 92, 86, 85, 83, 80, 52, 27, 15]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0164003372192383 seconds
Jaccard graph constructed in 0.542255163192749 seconds
Wrote graph to binary file in 0.188262939453125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.897149
After 7 runs, maximum modularity is Q = 0.898404
Louvain completed 27 runs in 1.6902661323547363 seconds
PhenoGraph complete in 3.4556806087493896 seconds
Found communities [-1, ... 21], with sizes: [115, 531, 489, 449, 394, 357, 325, 288, 230, 168, 163, 145, 144, 89, 85, 80, 79, 78, 52, 42, 26, 20, 13]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9105620384216309 seconds
Jaccard graph constructed in 0.5442867279052734 seconds
Wrote graph to binary file in 0.1835627555847168 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.892773
After 16 runs, maximum modularity is Q = 0.89382
Louvain completed 36 runs in 2.1753244400024414 seconds
PhenoGraph complete in 3.8374288082122803 seconds
Found communities [-1, ... 20], with sizes: [89, 594, 547, 507, 505, 383, 278, 236, 173, 166, 142, 131, 93, 92, 88, 83, 79, 73, 52, 26, 13, 12]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.4136576652526855 seconds
Jaccard graph constructed in 0.5412983894348145 seconds
Wrote graph to binary file in 0.20117902755737305 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895441
After 6 runs, maximum modularity is Q = 0.896495
Louvain completed 26 runs in 1.629683017730713 seconds
PhenoGraph complete in 3.8024487495422363 seconds
Found communities [-1, ... 18], with sizes: [103, 593, 550, 504, 390, 315, 306, 257, 233, 199, 178, 128, 114, 88, 88, 86, 77, 75, 52, 26]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.011387586593628 seconds
Jaccard graph constructed in 0.5407369136810303 seconds
Wrote graph to binary file in 0.07936930656433105 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.894018
After 4 runs, maximum modularity is Q = 0.895078
Louvain completed 24 runs in 1.5311081409454346 seconds
PhenoGraph complete in 3.178654670715332 seconds
Found communities [-1, ... 18], with sizes: [94, 740, 529, 525, 401, 328, 263, 262, 214, 199, 173, 145, 86, 82, 79, 78, 72, 52, 27, 13]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9134063720703125 seconds
Jaccard graph constructed in 0.5180704593658447 seconds
Wrote graph to binary file in 0.19541668891906738 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891901
After 5 runs, maximum modularity is Q = 0.892985
Louvain completed 25 runs in 1.6027026176452637 seconds
PhenoGraph complete in 3.2467846870422363 seconds
Found communities [-1, ... 18], with sizes: [92, 734, 553, 548, 518, 373, 256, 242, 211, 136, 136, 103, 88, 87, 72, 64, 52, 51, 28, 18]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.4157562255859375 seconds
Jaccard graph constructed in 0.571800708770752 seconds
Wrote graph to binary file in 0.1983933448791504 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.892291
After 3 runs, maximum modularity is Q = 0.894405
Louvain completed 23 runs in 1.4698371887207031 seconds
PhenoGraph complete in 3.675712823867798 seconds
Found communities [-1, ... 17], with sizes: [109, 738, 567, 562, 555, 299, 263, 229, 196, 150, 135, 86, 82, 82, 80, 80, 69, 52, 28]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9214785099029541 seconds
Jaccard graph constructed in 0.5363421440124512 seconds
Wrote graph to binary file in 0.17616724967956543 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.892314
Louvain completed 21 runs in 1.248786449432373 seconds
PhenoGraph complete in 2.9004197120666504 seconds
Found communities [-1, ... 20], with sizes: [99, 617, 528, 517, 444, 296, 262, 253, 205, 171, 165, 153, 143, 91, 87, 85, 73, 69, 52, 27, 14, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9119851589202881 seconds
Jaccard graph constructed in 0.52254319190979 seconds
Wrote graph to binary file in 0.18080687522888184 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893897
After 2 runs, maximum modularity is Q = 0.895141
Louvain completed 22 runs in 1.4266157150268555 seconds
PhenoGraph complete in 3.0582773685455322 seconds
Found communities [-1, ... 18], with sizes: [94, 567, 555, 507, 303, 299, 255, 252, 239, 236, 215, 204, 131, 100, 89, 86, 78, 74, 52, 26]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9102835655212402 seconds
Jaccard graph constructed in 0.5288314819335938 seconds
Wrote graph to binary file in 0.07883858680725098 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895046
Louvain completed 21 runs in 1.214411735534668 seconds
PhenoGraph complete in 2.7481415271759033 seconds
Found communities [-1, ... 18], with sizes: [96, 531, 525, 509, 463, 338, 271, 219, 217, 199, 193, 161, 143, 94, 87, 85, 81, 72, 52, 26]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3104100227355957 seconds
Jaccard graph constructed in 0.5491280555725098 seconds
Wrote graph to binary file in 0.08621072769165039 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891554
Louvain completed 21 runs in 1.3808362483978271 seconds
PhenoGraph complete in 3.344397783279419 seconds
Found communities [-1, ... 19], with sizes: [96, 542, 525, 506, 497, 402, 269, 260, 216, 189, 153, 134, 90, 88, 86, 81, 77, 57, 52, 26, 16]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0127100944519043 seconds
Jaccard graph constructed in 0.6711044311523438 seconds
Wrote graph to binary file in 0.07966256141662598 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89389
After 7 runs, maximum modularity is Q = 0.895132
Louvain completed 27 runs in 1.7397046089172363 seconds
PhenoGraph complete in 3.520399570465088 seconds
Found communities [-1, ... 17], with sizes: [70, 686, 565, 540, 529, 389, 267, 238, 221, 136, 134, 97, 88, 86, 83, 81, 75, 52, 25]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.913926362991333 seconds
Jaccard graph constructed in 0.5575425624847412 seconds
Wrote graph to binary file in 0.1962299346923828 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891216
Louvain completed 21 runs in 1.2624707221984863 seconds
PhenoGraph complete in 2.947330951690674 seconds
Found communities [-1, ... 18], with sizes: [87, 715, 554, 553, 429, 326, 271, 208, 202, 194, 175, 136, 94, 88, 87, 80, 68, 53, 26, 16]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.4119586944580078 seconds
Jaccard graph constructed in 0.5575528144836426 seconds
Wrote graph to binary file in 0.1961507797241211 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893882
After 7 runs, maximum modularity is Q = 0.895206
Louvain completed 27 runs in 1.6580243110656738 seconds
PhenoGraph complete in 3.8415257930755615 seconds
Found communities [-1, ... 19], with sizes: [98, 566, 545, 512, 399, 311, 274, 260, 195, 192, 186, 185, 138, 94, 86, 83, 74, 67, 52, 25, 20]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0112378597259521 seconds
Jaccard graph constructed in 0.5901081562042236 seconds
Wrote graph to binary file in 0.20053887367248535 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895551
Louvain completed 21 runs in 1.2312848567962646 seconds
PhenoGraph complete in 3.048438787460327 seconds
Found communities [-1, ... 18], with sizes: [105, 542, 510, 389, 375, 354, 340, 325, 286, 198, 154, 148, 142, 89, 87, 86, 77, 76, 53, 26]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.4124422073364258 seconds
Jaccard graph constructed in 0.5533256530761719 seconds
Wrote graph to binary file in 0.2122361660003662 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.894506
Louvain completed 21 runs in 1.266465425491333 seconds
PhenoGraph complete in 3.4707083702087402 seconds
Found communities [-1, ... 19], with sizes: [100, 611, 592, 560, 530, 272, 267, 184, 184, 170, 163, 144, 88, 86, 85, 80, 78, 75, 52, 26, 15]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.91042160987854 seconds
Jaccard graph constructed in 0.544696569442749 seconds
Wrote graph to binary file in 0.20070505142211914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.896103
After 2 runs, maximum modularity is Q = 0.897147
Louvain completed 22 runs in 1.4633309841156006 seconds
PhenoGraph complete in 3.1344850063323975 seconds
Found communities [-1, ... 19], with sizes: [105, 594, 586, 581, 498, 310, 251, 215, 188, 180, 173, 128, 92, 86, 83, 80, 69, 54, 33, 30, 26]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0117473602294922 seconds
Jaccard graph constructed in 0.528874397277832 seconds
Wrote graph to binary file in 0.07802391052246094 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893695
Louvain completed 21 runs in 1.224231481552124 seconds
PhenoGraph complete in 2.8586387634277344 seconds
Found communities [-1, ... 19], with sizes: [104, 581, 495, 448, 370, 360, 298, 262, 258, 215, 171, 161, 130, 111, 90, 85, 85, 54, 39, 26, 19]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7101707458496094 seconds
Jaccard graph constructed in 0.5220203399658203 seconds
Wrote graph to binary file in 0.1974945068359375 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.892453
After 9 runs, maximum modularity is Q = 0.893926
Louvain completed 29 runs in 1.802060604095459 seconds
PhenoGraph complete in 3.2479052543640137 seconds
Found communities [-1, ... 19], with sizes: [83, 749, 553, 537, 499, 376, 254, 196, 184, 144, 144, 103, 87, 82, 80, 79, 72, 53, 47, 26, 14]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9182157516479492 seconds
Jaccard graph constructed in 0.5311503410339355 seconds
Wrote graph to binary file in 0.19744205474853516 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895217
After 5 runs, maximum modularity is Q = 0.896632
Louvain completed 25 runs in 1.5544304847717285 seconds
PhenoGraph complete in 3.216425657272339 seconds
Found communities [-1, ... 19], with sizes: [100, 553, 545, 514, 466, 329, 297, 216, 195, 176, 162, 131, 105, 93, 88, 86, 78, 77, 73, 52, 26]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8139538764953613 seconds
Jaccard graph constructed in 0.5388381481170654 seconds
Wrote graph to binary file in 0.2006535530090332 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891788
After 3 runs, maximum modularity is Q = 0.892803
Louvain completed 23 runs in 1.5210297107696533 seconds
PhenoGraph complete in 3.0938730239868164 seconds
Found communities [-1, ... 18], with sizes: [75, 742, 538, 519, 510, 420, 287, 182, 154, 150, 149, 143, 86, 84, 82, 77, 72, 52, 27, 13]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.417912244796753 seconds
Jaccard graph constructed in 0.5502862930297852 seconds
Wrote graph to binary file in 0.2003021240234375 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895648
Louvain completed 21 runs in 1.2384800910949707 seconds
PhenoGraph complete in 3.423154592514038 seconds
Found communities [-1, ... 20], with sizes: [116, 573, 573, 520, 452, 375, 237, 236, 199, 188, 157, 124, 98, 86, 86, 84, 81, 61, 53, 27, 22, 14]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9103193283081055 seconds
Jaccard graph constructed in 0.5651466846466064 seconds
Wrote graph to binary file in 0.07817745208740234 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893139
Louvain completed 21 runs in 1.2251660823822021 seconds
PhenoGraph complete in 2.7935497760772705 seconds
Found communities [-1, ... 18], with sizes: [96, 559, 549, 509, 491, 348, 257, 233, 211, 188, 154, 141, 127, 89, 87, 85, 80, 79, 53, 26]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7102034091949463 seconds
Jaccard graph constructed in 0.5361120700836182 seconds
Wrote graph to binary file in 0.19938349723815918 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.892193
After 5 runs, maximum modularity is Q = 0.89363
Louvain completed 25 runs in 1.562617540359497 seconds
PhenoGraph complete in 3.0256247520446777 seconds
Found communities [-1, ... 17], with sizes: [103, 543, 531, 495, 451, 355, 344, 280, 243, 197, 196, 131, 91, 86, 84, 80, 74, 52, 26]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0116286277770996 seconds
Jaccard graph constructed in 0.5348474979400635 seconds
Wrote graph to binary file in 0.18000102043151855 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.894301
Louvain completed 21 runs in 1.299785852432251 seconds
PhenoGraph complete in 3.0412437915802 seconds
Found communities [-1, ... 18], with sizes: [96, 577, 497, 487, 454, 310, 308, 283, 276, 190, 138, 135, 131, 90, 86, 81, 74, 70, 52, 27]

In [52]:
sc.pp.normalize_per_cell(D363_Brus_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D363_Brus_Nas1) # log transform the data
D363_Brus_Nas1.raw = D363_Brus_Nas1 # freeze the object (for later use of the raw state of it)
In [53]:
D363_Brus_Nas1 = D363_Brus_Nas1[:, D363_Brus_Nas1.var['ribo_genes']]
D363_Brus_Nas1
Out[53]:
View of AnnData object with n_obs × n_vars = 3490 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [54]:
D367_Brus_Nas1 = sc.read_10x_mtx(
    './D367_Brus_Nas1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D367_Brus_Nas1.var_names_make_unique()
D367_Brus_Nas1.obs['manip'] = 'D367_Brus_Nas1'
D367_Brus_Nas1.obs['position'] = 'Nasal'
D367_Brus_Nas1.obs['method'] = 'Brushing'
D367_Brus_Nas1.obs['donor'] = 'D367'
D367_Brus_Nas1.obs['name'] = ['D367_Brus_Nas1_' + s for s in list(D367_Brus_Nas1.obs.index)]
D367_Brus_Nas1.obs_names = D367_Brus_Nas1.obs['name']
D367_Brus_Nas1
... reading from cache file ./cache/D367_Brus_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[54]:
AnnData object with n_obs × n_vars = 2596 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [55]:
sc.pl.highest_expr_genes(D367_Brus_Nas1, n_top=20)
In [56]:
sc.pp.filter_cells(D367_Brus_Nas1, min_genes=0)
mito_genes = D367_Brus_Nas1.var_names.str.startswith('MT-')
D367_Brus_Nas1.obs['percent_mito'] = np.sum(
    D367_Brus_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D367_Brus_Nas1.X, axis=1).A1
D367_Brus_Nas1.obs['n_counts'] = D367_Brus_Nas1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D367_Brus_Nas1.to_df())
ribo_genes = D367_Brus_Nas1.to_df().columns.isin(RB_genes_in_df)
D367_Brus_Nas1.obs['percent_ribo'] = np.sum(
    D367_Brus_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D367_Brus_Nas1.X, axis=1).A1
D367_Brus_Nas1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D367_Brus_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [57]:
sc.pp.filter_cells(D367_Brus_Nas1, min_genes=500)
D367_Brus_Nas1 = D367_Brus_Nas1[D367_Brus_Nas1.obs['n_counts'] < 30000, :]
D367_Brus_Nas1 = D367_Brus_Nas1[D367_Brus_Nas1.obs['percent_mito'] < 0.5, :]
In [58]:
# scrublet
scrub = scr.Scrublet(D367_Brus_Nas1.X, expected_doublet_rate=0.02)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D367_Brus_Nas1.obs['doublet_scores'] = doublet_scores
D367_Brus_Nas1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.24
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 12.8%
Overall doublet rate:
	Expected   = 2.0%
	Estimated  = 2.1%
Elapsed time: 2.3 seconds
Out[58]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ec3f50438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ec3f0bc18>],
       dtype=object))
In [59]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D367_Brus_Nas1.X).predict()
D367_Brus_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5084607601165771 seconds
Jaccard graph constructed in 0.4668412208557129 seconds
Wrote graph to binary file in 0.1636643409729004 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900197
After 4 runs, maximum modularity is Q = 0.902022
Louvain completed 24 runs in 1.3420603275299072 seconds
PhenoGraph complete in 2.4956376552581787 seconds
Found communities [-1, ... 20], with sizes: [127, 563, 345, 302, 292, 222, 208, 171, 136, 119, 113, 98, 89, 73, 71, 67, 61, 53, 52, 33, 29, 14]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.311171293258667 seconds
Jaccard graph constructed in 0.4383265972137451 seconds
Wrote graph to binary file in 0.061249732971191406 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900962
After 3 runs, maximum modularity is Q = 0.902774
Louvain completed 23 runs in 1.2876784801483154 seconds
PhenoGraph complete in 2.1234192848205566 seconds
Found communities [-1, ... 21], with sizes: [119, 536, 415, 292, 216, 166, 160, 157, 156, 153, 133, 123, 104, 101, 88, 71, 60, 51, 38, 37, 25, 21, 16]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7086927890777588 seconds
Jaccard graph constructed in 0.4393768310546875 seconds
Wrote graph to binary file in 0.16389083862304688 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902961
After 2 runs, maximum modularity is Q = 0.905819
Louvain completed 22 runs in 1.2734806537628174 seconds
PhenoGraph complete in 2.608114242553711 seconds
Found communities [-1, ... 19], with sizes: [149, 583, 344, 314, 264, 252, 160, 150, 139, 132, 124, 103, 103, 93, 75, 64, 63, 51, 32, 26, 17]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3153059482574463 seconds
Jaccard graph constructed in 0.5013718605041504 seconds
Wrote graph to binary file in 0.05684971809387207 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902067
After 2 runs, maximum modularity is Q = 0.903448
Louvain completed 22 runs in 1.2163736820220947 seconds
PhenoGraph complete in 2.1063852310180664 seconds
Found communities [-1, ... 23], with sizes: [97, 592, 333, 314, 242, 220, 146, 136, 110, 107, 103, 94, 92, 85, 75, 68, 67, 66, 63, 60, 52, 39, 36, 22, 19]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5093197822570801 seconds
Jaccard graph constructed in 0.5849921703338623 seconds
Wrote graph to binary file in 0.057523488998413086 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901447
After 18 runs, maximum modularity is Q = 0.902698
Louvain completed 38 runs in 1.9057786464691162 seconds
PhenoGraph complete in 3.070661783218384 seconds
Found communities [-1, ... 19], with sizes: [124, 578, 398, 235, 218, 218, 206, 166, 165, 139, 137, 126, 111, 104, 74, 64, 47, 44, 36, 32, 16]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30880260467529297 seconds
Jaccard graph constructed in 0.43804430961608887 seconds
Wrote graph to binary file in 0.1864478588104248 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901078
Louvain completed 21 runs in 1.083399772644043 seconds
PhenoGraph complete in 2.0399274826049805 seconds
Found communities [-1, ... 21], with sizes: [144, 574, 395, 313, 232, 169, 165, 158, 156, 119, 108, 101, 88, 83, 80, 80, 68, 65, 63, 36, 18, 12, 11]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3083302974700928 seconds
Jaccard graph constructed in 0.43538928031921387 seconds
Wrote graph to binary file in 0.05936908721923828 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900565
After 2 runs, maximum modularity is Q = 0.902568
Louvain completed 22 runs in 1.2561678886413574 seconds
PhenoGraph complete in 2.083144187927246 seconds
Found communities [-1, ... 22], with sizes: [144, 579, 428, 281, 203, 168, 154, 150, 132, 131, 107, 104, 94, 88, 88, 76, 73, 63, 58, 36, 29, 22, 17, 13]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5113711357116699 seconds
Jaccard graph constructed in 0.48171424865722656 seconds
Wrote graph to binary file in 0.1784346103668213 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902065
Louvain completed 21 runs in 1.0554733276367188 seconds
PhenoGraph complete in 2.2406728267669678 seconds
Found communities [-1, ... 20], with sizes: [112, 647, 351, 275, 270, 262, 147, 136, 132, 123, 99, 98, 95, 92, 70, 69, 68, 55, 49, 36, 36, 16]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30907249450683594 seconds
Jaccard graph constructed in 0.43910741806030273 seconds
Wrote graph to binary file in 0.0630044937133789 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900294
After 13 runs, maximum modularity is Q = 0.901576
Louvain completed 33 runs in 1.7589507102966309 seconds
PhenoGraph complete in 2.5902557373046875 seconds
Found communities [-1, ... 22], with sizes: [140, 592, 316, 290, 269, 238, 204, 168, 134, 106, 106, 100, 92, 91, 71, 61, 56, 48, 39, 39, 34, 17, 15, 12]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31041908264160156 seconds
Jaccard graph constructed in 0.5869097709655762 seconds
Wrote graph to binary file in 0.05759692192077637 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904244
Louvain completed 21 runs in 1.06337571144104 seconds
PhenoGraph complete in 2.0424818992614746 seconds
Found communities [-1, ... 19], with sizes: [116, 565, 426, 308, 235, 204, 177, 162, 159, 115, 108, 100, 99, 79, 75, 70, 69, 63, 59, 32, 17]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5087718963623047 seconds
Jaccard graph constructed in 0.4409444332122803 seconds
Wrote graph to binary file in 0.16645145416259766 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902516
After 2 runs, maximum modularity is Q = 0.905016
Louvain completed 22 runs in 1.2692945003509521 seconds
PhenoGraph complete in 2.400050163269043 seconds
Found communities [-1, ... 21], with sizes: [133, 574, 392, 288, 249, 247, 168, 158, 157, 113, 111, 101, 87, 77, 73, 66, 61, 53, 37, 33, 28, 18, 14]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31087613105773926 seconds
Jaccard graph constructed in 0.47104954719543457 seconds
Wrote graph to binary file in 0.05683398246765137 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903375
Louvain completed 21 runs in 1.0595769882202148 seconds
PhenoGraph complete in 1.9135053157806396 seconds
Found communities [-1, ... 21], with sizes: [132, 645, 328, 320, 263, 161, 154, 147, 130, 123, 121, 113, 106, 90, 87, 69, 66, 59, 36, 28, 25, 20, 15]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.714332103729248 seconds
Jaccard graph constructed in 0.49078965187072754 seconds
Wrote graph to binary file in 0.17988967895507812 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902795
After 4 runs, maximum modularity is Q = 0.90413
After 15 runs, maximum modularity is Q = 0.905254
Louvain completed 35 runs in 1.9082610607147217 seconds
PhenoGraph complete in 3.309579849243164 seconds
Found communities [-1, ... 21], with sizes: [155, 619, 363, 331, 220, 198, 182, 165, 130, 112, 102, 86, 85, 85, 70, 70, 64, 52, 51, 35, 32, 16, 15]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.608839750289917 seconds
Jaccard graph constructed in 0.4503631591796875 seconds
Wrote graph to binary file in 0.06904840469360352 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901071
After 2 runs, maximum modularity is Q = 0.902185
Louvain completed 22 runs in 1.2976205348968506 seconds
PhenoGraph complete in 2.4437458515167236 seconds
Found communities [-1, ... 18], with sizes: [155, 591, 341, 302, 299, 276, 164, 150, 146, 134, 116, 98, 94, 80, 69, 68, 66, 40, 33, 16]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4082968235015869 seconds
Jaccard graph constructed in 0.4441239833831787 seconds
Wrote graph to binary file in 0.15816211700439453 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899355
After 5 runs, maximum modularity is Q = 0.90109
Louvain completed 25 runs in 1.411998987197876 seconds
PhenoGraph complete in 2.4448084831237793 seconds
Found communities [-1, ... 20], with sizes: [134, 594, 391, 281, 230, 176, 158, 154, 147, 134, 127, 125, 103, 99, 90, 70, 68, 54, 37, 34, 17, 15]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5099594593048096 seconds
Jaccard graph constructed in 0.5047998428344727 seconds
Wrote graph to binary file in 0.18423247337341309 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905307
Louvain completed 21 runs in 1.0799944400787354 seconds
PhenoGraph complete in 2.2988812923431396 seconds
Found communities [-1, ... 24], with sizes: [101, 602, 425, 282, 181, 172, 161, 145, 142, 127, 105, 102, 87, 86, 81, 69, 67, 63, 45, 43, 37, 33, 26, 25, 17, 14]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5108683109283447 seconds
Jaccard graph constructed in 0.4646470546722412 seconds
Wrote graph to binary file in 0.05773448944091797 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901732
Louvain completed 21 runs in 1.0860049724578857 seconds
PhenoGraph complete in 2.150665283203125 seconds
Found communities [-1, ... 21], with sizes: [115, 595, 336, 293, 242, 181, 154, 150, 134, 133, 128, 110, 106, 98, 92, 87, 69, 63, 41, 41, 37, 17, 16]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5178375244140625 seconds
Jaccard graph constructed in 0.4842367172241211 seconds
Wrote graph to binary file in 0.18054437637329102 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900731
Louvain completed 21 runs in 1.0764310359954834 seconds
PhenoGraph complete in 2.2772862911224365 seconds
Found communities [-1, ... 22], with sizes: [122, 608, 363, 279, 250, 238, 155, 135, 125, 124, 105, 104, 90, 87, 75, 69, 68, 68, 54, 36, 32, 18, 18, 15]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5122959613800049 seconds
Jaccard graph constructed in 0.43636131286621094 seconds
Wrote graph to binary file in 0.05808568000793457 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902472
After 3 runs, maximum modularity is Q = 0.903498
Louvain completed 23 runs in 1.2720496654510498 seconds
PhenoGraph complete in 2.3114070892333984 seconds
Found communities [-1, ... 22], with sizes: [112, 564, 298, 281, 271, 206, 184, 165, 150, 142, 113, 100, 99, 99, 96, 71, 60, 57, 44, 38, 33, 25, 17, 13]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5092461109161377 seconds
Jaccard graph constructed in 0.4480714797973633 seconds
Wrote graph to binary file in 0.16853785514831543 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903368
Louvain completed 21 runs in 1.1005442142486572 seconds
PhenoGraph complete in 2.2482662200927734 seconds
Found communities [-1, ... 21], with sizes: [112, 556, 332, 318, 249, 237, 223, 164, 159, 118, 108, 101, 89, 72, 70, 63, 61, 57, 46, 33, 32, 20, 18]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5079901218414307 seconds
Jaccard graph constructed in 0.43482112884521484 seconds
Wrote graph to binary file in 0.16631746292114258 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90279
After 2 runs, maximum modularity is Q = 0.904373
Louvain completed 22 runs in 1.2382500171661377 seconds
PhenoGraph complete in 2.3846590518951416 seconds
Found communities [-1, ... 21], with sizes: [145, 595, 409, 327, 202, 202, 200, 135, 135, 132, 108, 99, 99, 83, 79, 70, 62, 39, 35, 31, 23, 16, 12]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5132453441619873 seconds
Jaccard graph constructed in 0.45227956771850586 seconds
Wrote graph to binary file in 0.05643153190612793 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899397
After 2 runs, maximum modularity is Q = 0.901089
Louvain completed 22 runs in 1.2974295616149902 seconds
PhenoGraph complete in 2.3314359188079834 seconds
Found communities [-1, ... 22], with sizes: [119, 630, 401, 307, 254, 179, 163, 162, 132, 125, 109, 106, 93, 89, 75, 63, 56, 42, 36, 30, 20, 18, 17, 12]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5112402439117432 seconds
Jaccard graph constructed in 0.43253445625305176 seconds
Wrote graph to binary file in 0.20474982261657715 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903668
Louvain completed 21 runs in 1.0554800033569336 seconds
PhenoGraph complete in 2.235367774963379 seconds
Found communities [-1, ... 20], with sizes: [114, 590, 353, 314, 285, 248, 163, 162, 138, 114, 103, 92, 85, 83, 69, 68, 67, 59, 49, 40, 25, 17]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5084431171417236 seconds
Jaccard graph constructed in 0.4371986389160156 seconds
Wrote graph to binary file in 0.06174921989440918 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899561
After 5 runs, maximum modularity is Q = 0.901489
Louvain completed 25 runs in 1.4370558261871338 seconds
PhenoGraph complete in 2.460622549057007 seconds
Found communities [-1, ... 22], with sizes: [133, 533, 316, 307, 289, 190, 183, 149, 137, 136, 107, 105, 104, 94, 73, 71, 61, 55, 50, 48, 32, 30, 19, 16]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5137088298797607 seconds
Jaccard graph constructed in 0.45734453201293945 seconds
Wrote graph to binary file in 0.1864638328552246 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902882
Louvain completed 21 runs in 1.0718755722045898 seconds
PhenoGraph complete in 2.2464241981506348 seconds
Found communities [-1, ... 22], with sizes: [153, 569, 319, 277, 254, 209, 174, 172, 134, 133, 114, 104, 91, 89, 81, 70, 66, 66, 38, 33, 33, 32, 16, 11]

In [60]:
sc.pp.normalize_per_cell(D367_Brus_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D367_Brus_Nas1) # log transform the data
D367_Brus_Nas1.raw = D367_Brus_Nas1 # freeze the object (for later use of the raw state of it)
In [61]:
D367_Brus_Nas1 = D367_Brus_Nas1[:, D367_Brus_Nas1.var['ribo_genes']]
D367_Brus_Nas1
Out[61]:
View of AnnData object with n_obs × n_vars = 2591 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [62]:
D372_Brus_Nas1 = sc.read_10x_mtx(
    './D372_Brus_Nas1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D372_Brus_Nas1.var_names_make_unique()
D372_Brus_Nas1.obs['manip'] = 'D372_Brus_Nas1'
D372_Brus_Nas1.obs['position'] = 'Nasal'
D372_Brus_Nas1.obs['method'] = 'Brushing'
D372_Brus_Nas1.obs['donor'] = 'D372'
D372_Brus_Nas1.obs['name'] = ['D372_Brus_Nas1_' + s for s in list(D372_Brus_Nas1.obs.index)]
D372_Brus_Nas1.obs_names = D372_Brus_Nas1.obs['name']
D372_Brus_Nas1
... reading from cache file ./cache/D372_Brus_Nas1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[62]:
AnnData object with n_obs × n_vars = 2336 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [63]:
sc.pl.highest_expr_genes(D372_Brus_Nas1, n_top=20)
In [64]:
sc.pp.filter_cells(D372_Brus_Nas1, min_genes=0)
mito_genes = D372_Brus_Nas1.var_names.str.startswith('MT-')
D372_Brus_Nas1.obs['percent_mito'] = np.sum(
    D372_Brus_Nas1[:, mito_genes].X, axis=1).A1 / np.sum(D372_Brus_Nas1.X, axis=1).A1
D372_Brus_Nas1.obs['n_counts'] = D372_Brus_Nas1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Brus_Nas1.to_df())
ribo_genes = D372_Brus_Nas1.to_df().columns.isin(RB_genes_in_df)
D372_Brus_Nas1.obs['percent_ribo'] = np.sum(
    D372_Brus_Nas1[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Brus_Nas1.X, axis=1).A1
D372_Brus_Nas1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D372_Brus_Nas1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [65]:
sc.pp.filter_cells(D372_Brus_Nas1, min_genes=500)
D372_Brus_Nas1 = D372_Brus_Nas1[D372_Brus_Nas1.obs['n_counts'] < 40000, :]
D372_Brus_Nas1 = D372_Brus_Nas1[D372_Brus_Nas1.obs['percent_mito'] < 0.5, :]
In [66]:
# scrublet
scrub = scr.Scrublet(D372_Brus_Nas1.X, expected_doublet_rate=0.019)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D372_Brus_Nas1.obs['doublet_scores'] = doublet_scores
D372_Brus_Nas1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.22
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 15.0%
Overall doublet rate:
	Expected   = 1.9%
	Estimated  = 1.7%
Elapsed time: 2.1 seconds
Out[66]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eca849a20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecba29fd0>],
       dtype=object))
In [67]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D372_Brus_Nas1.X).predict()
D372_Brus_Nas1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4104349613189697 seconds
Jaccard graph constructed in 0.48456406593322754 seconds
Wrote graph to binary file in 0.05161619186401367 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902756
Louvain completed 21 runs in 1.038480281829834 seconds
PhenoGraph complete in 1.9975333213806152 seconds
Found communities [-1, ... 21], with sizes: [93, 457, 316, 305, 289, 288, 164, 146, 128, 114, 95, 67, 66, 65, 56, 50, 49, 35, 34, 34, 22, 21, 17]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4088413715362549 seconds
Jaccard graph constructed in 0.4397096633911133 seconds
Wrote graph to binary file in 0.16575980186462402 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905068
Louvain completed 21 runs in 1.0620262622833252 seconds
PhenoGraph complete in 2.0957629680633545 seconds
Found communities [-1, ... 20], with sizes: [91, 418, 315, 304, 291, 252, 182, 174, 173, 141, 74, 73, 72, 68, 64, 53, 39, 38, 36, 22, 19, 12]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41448283195495605 seconds
Jaccard graph constructed in 0.4419240951538086 seconds
Wrote graph to binary file in 0.054010629653930664 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904481
Louvain completed 21 runs in 1.05592942237854 seconds
PhenoGraph complete in 1.9787116050720215 seconds
Found communities [-1, ... 21], with sizes: [100, 403, 329, 288, 245, 217, 211, 185, 152, 121, 110, 80, 74, 69, 68, 56, 47, 39, 35, 29, 22, 18, 13]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4087090492248535 seconds
Jaccard graph constructed in 0.43219971656799316 seconds
Wrote graph to binary file in 0.18044114112854004 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899057
Louvain completed 21 runs in 1.039236307144165 seconds
PhenoGraph complete in 2.0733461380004883 seconds
Found communities [-1, ... 20], with sizes: [80, 406, 341, 320, 292, 236, 176, 153, 152, 139, 90, 76, 75, 71, 55, 52, 51, 44, 34, 29, 23, 16]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4097177982330322 seconds
Jaccard graph constructed in 0.4422883987426758 seconds
Wrote graph to binary file in 0.05698728561401367 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900724
Louvain completed 21 runs in 1.0721895694732666 seconds
PhenoGraph complete in 2.001919984817505 seconds
Found communities [-1, ... 23], with sizes: [88, 370, 309, 222, 195, 195, 193, 171, 146, 139, 125, 114, 88, 80, 78, 73, 64, 55, 47, 44, 34, 33, 22, 13, 13]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41097593307495117 seconds
Jaccard graph constructed in 0.5679218769073486 seconds
Wrote graph to binary file in 0.05442643165588379 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900366
After 4 runs, maximum modularity is Q = 0.901533
Louvain completed 24 runs in 1.3656713962554932 seconds
PhenoGraph complete in 2.410922050476074 seconds
Found communities [-1, ... 22], with sizes: [95, 396, 333, 298, 253, 250, 176, 144, 120, 89, 85, 85, 77, 74, 71, 66, 56, 52, 42, 42, 34, 34, 22, 17]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5100057125091553 seconds
Jaccard graph constructed in 0.4215846061706543 seconds
Wrote graph to binary file in 0.1642932891845703 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902812
Louvain completed 21 runs in 1.056971788406372 seconds
PhenoGraph complete in 2.165310859680176 seconds
Found communities [-1, ... 21], with sizes: [97, 395, 372, 269, 255, 251, 242, 165, 124, 122, 87, 82, 80, 63, 62, 50, 37, 36, 34, 33, 22, 20, 13]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4157731533050537 seconds
Jaccard graph constructed in 0.44240331649780273 seconds
Wrote graph to binary file in 0.05346488952636719 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904502
Louvain completed 21 runs in 1.0716676712036133 seconds
PhenoGraph complete in 1.9949562549591064 seconds
Found communities [-1, ... 21], with sizes: [74, 398, 330, 280, 277, 257, 197, 179, 172, 144, 81, 76, 74, 70, 59, 50, 42, 37, 34, 29, 23, 16, 12]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4089932441711426 seconds
Jaccard graph constructed in 0.4276108741760254 seconds
Wrote graph to binary file in 0.18631744384765625 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899931
After 4 runs, maximum modularity is Q = 0.901282
Louvain completed 24 runs in 1.332381010055542 seconds
PhenoGraph complete in 2.370694398880005 seconds
Found communities [-1, ... 23], with sizes: [85, 382, 245, 243, 237, 188, 183, 156, 147, 123, 122, 117, 96, 80, 76, 72, 66, 56, 51, 48, 35, 34, 33, 22, 14]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40883970260620117 seconds
Jaccard graph constructed in 0.4815654754638672 seconds
Wrote graph to binary file in 0.05580449104309082 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905739
Louvain completed 21 runs in 1.0528161525726318 seconds
PhenoGraph complete in 2.0144546031951904 seconds
Found communities [-1, ... 20], with sizes: [89, 407, 327, 298, 245, 221, 187, 168, 150, 142, 111, 82, 77, 73, 72, 55, 54, 41, 37, 35, 22, 18]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4092233180999756 seconds
Jaccard graph constructed in 0.43491411209106445 seconds
Wrote graph to binary file in 0.1816234588623047 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902347
Louvain completed 21 runs in 1.0661087036132812 seconds
PhenoGraph complete in 2.1032302379608154 seconds
Found communities [-1, ... 21], with sizes: [85, 413, 385, 294, 292, 275, 195, 180, 159, 81, 76, 68, 68, 53, 52, 43, 41, 35, 34, 29, 22, 19, 12]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4102473258972168 seconds
Jaccard graph constructed in 0.4155423641204834 seconds
Wrote graph to binary file in 0.056139469146728516 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90426
Louvain completed 21 runs in 1.0423285961151123 seconds
PhenoGraph complete in 1.9398508071899414 seconds
Found communities [-1, ... 22], with sizes: [94, 426, 366, 282, 246, 211, 206, 191, 144, 120, 84, 74, 73, 72, 65, 46, 40, 38, 36, 30, 22, 18, 16, 11]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4085385799407959 seconds
Jaccard graph constructed in 0.4093973636627197 seconds
Wrote graph to binary file in 0.16074681282043457 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902218
Louvain completed 21 runs in 1.0713722705841064 seconds
PhenoGraph complete in 2.0610687732696533 seconds
Found communities [-1, ... 22], with sizes: [94, 425, 304, 281, 273, 236, 182, 171, 150, 142, 112, 73, 68, 67, 65, 53, 42, 35, 35, 34, 22, 18, 15, 14]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4122939109802246 seconds
Jaccard graph constructed in 0.43650269508361816 seconds
Wrote graph to binary file in 0.058606863021850586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900811
After 5 runs, maximum modularity is Q = 0.901957
Louvain completed 25 runs in 1.380363941192627 seconds
PhenoGraph complete in 2.306610107421875 seconds
Found communities [-1, ... 21], with sizes: [107, 413, 315, 298, 296, 269, 221, 184, 117, 96, 86, 76, 64, 62, 54, 49, 48, 38, 34, 34, 22, 15, 13]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40839338302612305 seconds
Jaccard graph constructed in 0.4399690628051758 seconds
Wrote graph to binary file in 0.18412351608276367 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900966
After 2 runs, maximum modularity is Q = 0.902166
Louvain completed 22 runs in 1.2874994277954102 seconds
PhenoGraph complete in 2.334472894668579 seconds
Found communities [-1, ... 22], with sizes: [108, 417, 303, 295, 245, 231, 225, 206, 172, 106, 75, 69, 66, 55, 54, 50, 46, 44, 34, 29, 27, 22, 17, 15]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.419569730758667 seconds
Jaccard graph constructed in 0.43588876724243164 seconds
Wrote graph to binary file in 0.05357718467712402 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902249
Louvain completed 21 runs in 1.0449965000152588 seconds
PhenoGraph complete in 1.974944829940796 seconds
Found communities [-1, ... 20], with sizes: [105, 415, 315, 301, 277, 274, 190, 184, 117, 98, 88, 86, 79, 72, 62, 58, 42, 40, 36, 33, 22, 17]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40757298469543457 seconds
Jaccard graph constructed in 0.4286377429962158 seconds
Wrote graph to binary file in 0.18510174751281738 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899864
After 2 runs, maximum modularity is Q = 0.901044
Louvain completed 22 runs in 1.2612056732177734 seconds
PhenoGraph complete in 2.300072193145752 seconds
Found communities [-1, ... 21], with sizes: [101, 395, 370, 277, 275, 234, 205, 151, 139, 134, 79, 78, 76, 72, 57, 56, 56, 41, 35, 28, 22, 18, 12]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5133953094482422 seconds
Jaccard graph constructed in 0.43563127517700195 seconds
Wrote graph to binary file in 0.06734108924865723 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901219
Louvain completed 21 runs in 1.0888986587524414 seconds
PhenoGraph complete in 2.133082389831543 seconds
Found communities [-1, ... 19], with sizes: [83, 424, 353, 293, 292, 284, 202, 186, 154, 82, 78, 74, 71, 68, 64, 51, 47, 35, 31, 22, 17]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.414459228515625 seconds
Jaccard graph constructed in 0.44033050537109375 seconds
Wrote graph to binary file in 0.17866730690002441 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899097
After 2 runs, maximum modularity is Q = 0.900133
Louvain completed 22 runs in 1.2436597347259521 seconds
PhenoGraph complete in 2.2888524532318115 seconds
Found communities [-1, ... 23], with sizes: [108, 313, 307, 284, 259, 253, 189, 163, 126, 112, 109, 92, 78, 75, 70, 69, 65, 47, 39, 37, 34, 34, 22, 15, 11]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4095485210418701 seconds
Jaccard graph constructed in 0.49824070930480957 seconds
Wrote graph to binary file in 0.05281829833984375 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902287
Louvain completed 21 runs in 1.0733473300933838 seconds
PhenoGraph complete in 2.0504233837127686 seconds
Found communities [-1, ... 20], with sizes: [89, 423, 307, 302, 211, 205, 202, 175, 155, 136, 114, 88, 81, 80, 72, 67, 52, 45, 34, 33, 22, 18]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40773892402648926 seconds
Jaccard graph constructed in 0.44121623039245605 seconds
Wrote graph to binary file in 0.18178534507751465 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902912
Louvain completed 21 runs in 1.0678446292877197 seconds
PhenoGraph complete in 2.1116764545440674 seconds
Found communities [-1, ... 20], with sizes: [86, 364, 358, 287, 272, 261, 240, 192, 146, 132, 87, 86, 76, 73, 70, 43, 34, 30, 27, 22, 14, 11]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40897274017333984 seconds
Jaccard graph constructed in 0.43024516105651855 seconds
Wrote graph to binary file in 0.05281400680541992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907184
Louvain completed 21 runs in 1.0659265518188477 seconds
PhenoGraph complete in 1.9704034328460693 seconds
Found communities [-1, ... 22], with sizes: [121, 386, 314, 290, 281, 253, 220, 161, 141, 89, 78, 76, 70, 66, 64, 57, 47, 44, 39, 34, 28, 22, 17, 13]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40806055068969727 seconds
Jaccard graph constructed in 0.43677735328674316 seconds
Wrote graph to binary file in 0.15897130966186523 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902582
After 13 runs, maximum modularity is Q = 0.904015
Louvain completed 33 runs in 1.7000999450683594 seconds
PhenoGraph complete in 2.715348720550537 seconds
Found communities [-1, ... 21], with sizes: [108, 358, 318, 311, 289, 242, 190, 165, 141, 117, 96, 79, 75, 70, 65, 58, 47, 44, 36, 32, 31, 22, 17]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41316866874694824 seconds
Jaccard graph constructed in 0.4379744529724121 seconds
Wrote graph to binary file in 0.18474483489990234 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899323
After 2 runs, maximum modularity is Q = 0.900924
Louvain completed 22 runs in 1.2787697315216064 seconds
PhenoGraph complete in 2.327200174331665 seconds
Found communities [-1, ... 22], with sizes: [100, 385, 313, 256, 242, 182, 163, 152, 151, 151, 151, 90, 86, 72, 68, 64, 55, 50, 41, 35, 33, 32, 23, 16]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.408933162689209 seconds
Jaccard graph constructed in 0.42848753929138184 seconds
Wrote graph to binary file in 0.050803184509277344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899611
Louvain completed 21 runs in 1.0532317161560059 seconds
PhenoGraph complete in 1.952439546585083 seconds
Found communities [-1, ... 21], with sizes: [103, 410, 372, 254, 229, 225, 199, 159, 137, 135, 106, 79, 79, 78, 70, 54, 49, 37, 37, 35, 27, 23, 14]

In [68]:
sc.pp.normalize_per_cell(D372_Brus_Nas1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D372_Brus_Nas1) # log transform the data
D372_Brus_Nas1.raw = D372_Brus_Nas1 # freeze the object (for later use of the raw state of it)
In [69]:
D372_Brus_Nas1 = D372_Brus_Nas1[:, D372_Brus_Nas1.var['ribo_genes']]
D372_Brus_Nas1
Out[69]:
View of AnnData object with n_obs × n_vars = 2329 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

Proximal Biopsies

Back to top

In [70]:
D322_Biop_Pro1 = sc.read_10x_mtx(
    './D322_Biop_Pro1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D322_Biop_Pro1.var_names_make_unique()
D322_Biop_Pro1.obs['manip'] = 'D322_Biop_Pro1'
D322_Biop_Pro1.obs['position'] = 'Proximal'
D322_Biop_Pro1.obs['method'] = 'Biopsy'
D322_Biop_Pro1.obs['donor'] = 'D322'
D322_Biop_Pro1.obs['name'] = ['D322_Biop_Pro1_' + s for s in list(D322_Biop_Pro1.obs.index)]
D322_Biop_Pro1.obs_names = D322_Biop_Pro1.obs['name']
D322_Biop_Pro1
... reading from cache file ./cache/D322_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[70]:
AnnData object with n_obs × n_vars = 2035 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [71]:
sc.pl.highest_expr_genes(D322_Biop_Pro1, n_top=20)
In [72]:
sc.pp.filter_cells(D322_Biop_Pro1, min_genes=0)
mito_genes = D322_Biop_Pro1.var_names.str.startswith('MT-')
D322_Biop_Pro1.obs['percent_mito'] = np.sum(
    D322_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D322_Biop_Pro1.X, axis=1).A1
D322_Biop_Pro1.obs['n_counts'] = D322_Biop_Pro1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D322_Biop_Pro1.to_df())
ribo_genes = D322_Biop_Pro1.to_df().columns.isin(RB_genes_in_df)
D322_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D322_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D322_Biop_Pro1.X, axis=1).A1
D322_Biop_Pro1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D322_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [73]:
sc.pp.filter_cells(D322_Biop_Pro1, min_genes=500)
D322_Biop_Pro1 = D322_Biop_Pro1[D322_Biop_Pro1.obs['n_counts'] < 20000, :]
D322_Biop_Pro1 = D322_Biop_Pro1[D322_Biop_Pro1.obs['percent_mito'] < 0.3, :]
filtered out 60 cells that have less than 500 genes expressed
In [74]:
# scrublet
scrub = scr.Scrublet(D322_Biop_Pro1.X, expected_doublet_rate=0.016)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D322_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D322_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.18
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 9.6%
Overall doublet rate:
	Expected   = 1.6%
	Estimated  = 4.3%
Elapsed time: 1.0 seconds
Out[74]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ebfe203c8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ebfaf2828>],
       dtype=object))
In [75]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D322_Biop_Pro1.X).predict()
D322_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6393563747406006 seconds
Jaccard graph constructed in 0.4113748073577881 seconds
Wrote graph to binary file in 0.0392765998840332 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900925
Louvain completed 21 runs in 1.0455265045166016 seconds
PhenoGraph complete in 2.1460633277893066 seconds
Found communities [-1, ... 18], with sizes: [140, 811, 395, 198, 155, 147, 92, 74, 68, 53, 48, 42, 38, 38, 35, 32, 28, 24, 22, 12]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4117100238800049 seconds
Jaccard graph constructed in 0.410874605178833 seconds
Wrote graph to binary file in 0.03756237030029297 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899296
After 3 runs, maximum modularity is Q = 0.900804
Louvain completed 23 runs in 1.3342885971069336 seconds
PhenoGraph complete in 2.2148613929748535 seconds
Found communities [-1, ... 19], with sizes: [205, 769, 398, 178, 142, 140, 76, 74, 74, 69, 47, 42, 39, 35, 35, 28, 25, 25, 24, 14, 13]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40962743759155273 seconds
Jaccard graph constructed in 0.4233078956604004 seconds
Wrote graph to binary file in 0.17347097396850586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900464
Louvain completed 21 runs in 1.0541834831237793 seconds
PhenoGraph complete in 2.0714707374572754 seconds
Found communities [-1, ... 18], with sizes: [163, 778, 371, 217, 165, 160, 100, 77, 72, 63, 42, 42, 42, 34, 27, 26, 22, 21, 17, 13]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4098362922668457 seconds
Jaccard graph constructed in 0.4011504650115967 seconds
Wrote graph to binary file in 0.03797626495361328 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899684
Louvain completed 21 runs in 1.0307955741882324 seconds
PhenoGraph complete in 1.8917357921600342 seconds
Found communities [-1, ... 18], with sizes: [192, 703, 370, 237, 202, 136, 96, 83, 80, 55, 44, 42, 33, 33, 33, 27, 25, 25, 20, 16]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4088118076324463 seconds
Jaccard graph constructed in 0.45768094062805176 seconds
Wrote graph to binary file in 0.039757728576660156 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901873
Louvain completed 21 runs in 1.0425488948822021 seconds
PhenoGraph complete in 1.9624834060668945 seconds
Found communities [-1, ... 18], with sizes: [185, 764, 398, 199, 165, 139, 91, 82, 68, 56, 43, 41, 40, 37, 33, 33, 25, 22, 17, 14]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40851902961730957 seconds
Jaccard graph constructed in 0.39827442169189453 seconds
Wrote graph to binary file in 0.1715395450592041 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.896473
After 2 runs, maximum modularity is Q = 0.897808
Louvain completed 22 runs in 1.255896806716919 seconds
PhenoGraph complete in 2.24523663520813 seconds
Found communities [-1, ... 18], with sizes: [134, 818, 349, 193, 192, 141, 98, 89, 74, 64, 47, 43, 39, 35, 29, 27, 25, 22, 21, 12]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.408618688583374 seconds
Jaccard graph constructed in 0.4059271812438965 seconds
Wrote graph to binary file in 0.04202532768249512 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.897849
After 2 runs, maximum modularity is Q = 0.900395
Louvain completed 22 runs in 1.311697006225586 seconds
PhenoGraph complete in 2.18277645111084 seconds
Found communities [-1, ... 19], with sizes: [139, 797, 390, 181, 176, 173, 80, 72, 71, 67, 44, 43, 38, 35, 30, 26, 23, 22, 17, 15, 13]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40802645683288574 seconds
Jaccard graph constructed in 0.4162874221801758 seconds
Wrote graph to binary file in 0.03611612319946289 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902588
Louvain completed 21 runs in 1.050546407699585 seconds
PhenoGraph complete in 1.9213097095489502 seconds
Found communities [-1, ... 19], with sizes: [201, 791, 289, 282, 184, 141, 91, 67, 67, 58, 38, 36, 35, 31, 30, 26, 25, 21, 14, 14, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40918421745300293 seconds
Jaccard graph constructed in 0.40732789039611816 seconds
Wrote graph to binary file in 0.1727921962738037 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.898019
Louvain completed 21 runs in 1.068749189376831 seconds
PhenoGraph complete in 2.0713589191436768 seconds
Found communities [-1, ... 19], with sizes: [171, 794, 396, 168, 164, 136, 96, 80, 69, 63, 54, 45, 36, 33, 30, 26, 25, 22, 20, 13, 11]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4082217216491699 seconds
Jaccard graph constructed in 0.39855480194091797 seconds
Wrote graph to binary file in 0.03763461112976074 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.898898
Louvain completed 21 runs in 1.0382418632507324 seconds
PhenoGraph complete in 1.8954129219055176 seconds
Found communities [-1, ... 19], with sizes: [170, 755, 296, 217, 178, 138, 92, 90, 76, 67, 66, 52, 46, 35, 33, 32, 31, 29, 21, 16, 12]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4072906970977783 seconds
Jaccard graph constructed in 0.406890869140625 seconds
Wrote graph to binary file in 0.03890848159790039 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89867
After 5 runs, maximum modularity is Q = 0.899724
Louvain completed 25 runs in 1.3519103527069092 seconds
PhenoGraph complete in 2.2166450023651123 seconds
Found communities [-1, ... 18], with sizes: [152, 770, 398, 190, 179, 140, 76, 71, 69, 65, 59, 46, 44, 42, 37, 35, 27, 23, 16, 13]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4082176685333252 seconds
Jaccard graph constructed in 0.42229700088500977 seconds
Wrote graph to binary file in 0.15726685523986816 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902191
Louvain completed 21 runs in 1.0502548217773438 seconds
PhenoGraph complete in 2.049294948577881 seconds
Found communities [-1, ... 18], with sizes: [165, 758, 363, 240, 180, 130, 75, 74, 69, 55, 54, 46, 44, 41, 38, 27, 27, 25, 22, 19]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4109930992126465 seconds
Jaccard graph constructed in 0.41216397285461426 seconds
Wrote graph to binary file in 0.037985801696777344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900734
Louvain completed 21 runs in 1.0605406761169434 seconds
PhenoGraph complete in 1.9386405944824219 seconds
Found communities [-1, ... 18], with sizes: [180, 740, 298, 264, 196, 143, 90, 81, 81, 61, 53, 50, 41, 35, 33, 32, 27, 17, 16, 14]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40829038619995117 seconds
Jaccard graph constructed in 0.42495107650756836 seconds
Wrote graph to binary file in 0.037286996841430664 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.897438
Louvain completed 21 runs in 1.117274522781372 seconds
PhenoGraph complete in 1.999619960784912 seconds
Found communities [-1, ... 17], with sizes: [183, 790, 379, 180, 173, 162, 83, 76, 72, 72, 55, 46, 44, 32, 32, 24, 24, 14, 11]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41057324409484863 seconds
Jaccard graph constructed in 0.4098360538482666 seconds
Wrote graph to binary file in 0.1486949920654297 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.898678
After 3 runs, maximum modularity is Q = 0.899945
Louvain completed 23 runs in 1.284012794494629 seconds
PhenoGraph complete in 2.2636094093322754 seconds
Found communities [-1, ... 20], with sizes: [189, 762, 392, 196, 169, 132, 89, 84, 66, 63, 46, 35, 33, 30, 29, 27, 25, 23, 18, 17, 14, 13]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4084312915802002 seconds
Jaccard graph constructed in 0.4162874221801758 seconds
Wrote graph to binary file in 0.03622627258300781 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900493
Louvain completed 21 runs in 1.05033540725708 seconds
PhenoGraph complete in 1.923551082611084 seconds
Found communities [-1, ... 20], with sizes: [171, 791, 352, 196, 166, 140, 88, 72, 68, 62, 56, 42, 36, 34, 29, 28, 25, 23, 22, 18, 18, 15]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40816807746887207 seconds
Jaccard graph constructed in 0.4208359718322754 seconds
Wrote graph to binary file in 0.03706812858581543 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90044
After 3 runs, maximum modularity is Q = 0.901495
Louvain completed 23 runs in 1.2756478786468506 seconds
PhenoGraph complete in 2.1544413566589355 seconds
Found communities [-1, ... 19], with sizes: [149, 812, 405, 179, 160, 139, 93, 81, 69, 54, 46, 42, 39, 34, 33, 29, 23, 21, 17, 15, 12]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40907955169677734 seconds
Jaccard graph constructed in 0.411531925201416 seconds
Wrote graph to binary file in 0.15400314331054688 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90043
Louvain completed 21 runs in 1.0795376300811768 seconds
PhenoGraph complete in 2.066520929336548 seconds
Found communities [-1, ... 20], with sizes: [195, 779, 395, 168, 136, 132, 83, 82, 78, 69, 45, 41, 36, 35, 31, 31, 30, 24, 18, 16, 14, 14]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40826964378356934 seconds
Jaccard graph constructed in 0.41058945655822754 seconds
Wrote graph to binary file in 0.03816676139831543 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89756
Louvain completed 21 runs in 1.0991933345794678 seconds
PhenoGraph complete in 1.9706928730010986 seconds
Found communities [-1, ... 18], with sizes: [145, 802, 385, 193, 182, 135, 80, 80, 68, 68, 54, 46, 37, 35, 32, 31, 26, 23, 18, 12]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40864086151123047 seconds
Jaccard graph constructed in 0.41047143936157227 seconds
Wrote graph to binary file in 0.036835670471191406 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899154
Louvain completed 21 runs in 1.0300240516662598 seconds
PhenoGraph complete in 1.898136854171753 seconds
Found communities [-1, ... 18], with sizes: [146, 772, 404, 197, 167, 142, 94, 81, 57, 57, 49, 44, 42, 37, 36, 33, 33, 28, 22, 11]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40981054306030273 seconds
Jaccard graph constructed in 0.4129366874694824 seconds
Wrote graph to binary file in 0.15970945358276367 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.897327
After 2 runs, maximum modularity is Q = 0.89844
Louvain completed 22 runs in 1.269575595855713 seconds
PhenoGraph complete in 2.264087438583374 seconds
Found communities [-1, ... 19], with sizes: [169, 725, 391, 234, 197, 134, 78, 67, 66, 52, 52, 42, 38, 38, 35, 30, 28, 23, 23, 18, 12]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4084758758544922 seconds
Jaccard graph constructed in 0.4285869598388672 seconds
Wrote graph to binary file in 0.037267446517944336 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895792
Louvain completed 21 runs in 1.0494191646575928 seconds
PhenoGraph complete in 1.937840223312378 seconds
Found communities [-1, ... 19], with sizes: [170, 753, 372, 196, 184, 140, 99, 88, 77, 68, 38, 38, 37, 36, 26, 26, 23, 23, 23, 22, 13]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4080162048339844 seconds
Jaccard graph constructed in 0.41349220275878906 seconds
Wrote graph to binary file in 0.03620028495788574 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.898052
Louvain completed 21 runs in 1.0701885223388672 seconds
PhenoGraph complete in 1.9381113052368164 seconds
Found communities [-1, ... 20], with sizes: [182, 800, 303, 198, 155, 140, 93, 69, 69, 67, 63, 46, 37, 36, 34, 34, 33, 27, 22, 17, 15, 12]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40899109840393066 seconds
Jaccard graph constructed in 0.3948996067047119 seconds
Wrote graph to binary file in 0.16979265213012695 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.896895
Louvain completed 21 runs in 1.024454116821289 seconds
PhenoGraph complete in 2.008869171142578 seconds
Found communities [-1, ... 16], with sizes: [173, 754, 344, 222, 190, 160, 82, 74, 72, 70, 61, 57, 44, 42, 36, 32, 23, 16]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40865564346313477 seconds
Jaccard graph constructed in 0.4541025161743164 seconds
Wrote graph to binary file in 0.04162740707397461 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900106
Louvain completed 21 runs in 1.0664453506469727 seconds
PhenoGraph complete in 1.9894070625305176 seconds
Found communities [-1, ... 20], with sizes: [142, 797, 368, 196, 185, 137, 84, 69, 68, 64, 46, 41, 37, 34, 34, 32, 26, 24, 23, 18, 16, 11]

In [76]:
sc.pp.normalize_per_cell(D322_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D322_Biop_Pro1) # log transform the data
D322_Biop_Pro1.raw = D322_Biop_Pro1 # freeze the object (for later use of the raw state of it)
In [77]:
D322_Biop_Pro1 = D322_Biop_Pro1[:, D322_Biop_Pro1.var['ribo_genes']]
D322_Biop_Pro1
Out[77]:
View of AnnData object with n_obs × n_vars = 1962 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [78]:
D326_Biop_Pro1 = sc.read_10x_mtx(
    './D326_Biop_Pro1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D326_Biop_Pro1.var_names_make_unique()
D326_Biop_Pro1.obs['manip'] = 'D326_Biop_Pro1'
D326_Biop_Pro1.obs['position'] = 'Proximal'
D326_Biop_Pro1.obs['method'] = 'Biopsy'
D326_Biop_Pro1.obs['donor'] = 'D326'
D326_Biop_Pro1.obs['name'] = ['D326_Biop_Pro1_' + s for s in list(D326_Biop_Pro1.obs.index)]
D326_Biop_Pro1.obs_names = D326_Biop_Pro1.obs['name']
D326_Biop_Pro1
... reading from cache file ./cache/D326_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[78]:
AnnData object with n_obs × n_vars = 2941 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [79]:
sc.pl.highest_expr_genes(D326_Biop_Pro1, n_top=20)
In [80]:
sc.pp.filter_cells(D326_Biop_Pro1, min_genes=0)
mito_genes = D326_Biop_Pro1.var_names.str.startswith('MT-')
D326_Biop_Pro1.obs['percent_mito'] = np.sum(
    D326_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D326_Biop_Pro1.X, axis=1).A1
D326_Biop_Pro1.obs['n_counts'] = D326_Biop_Pro1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D326_Biop_Pro1.to_df())
ribo_genes = D326_Biop_Pro1.to_df().columns.isin(RB_genes_in_df)
D326_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D326_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D326_Biop_Pro1.X, axis=1).A1
D326_Biop_Pro1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D326_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [81]:
sc.pp.filter_cells(D326_Biop_Pro1, min_genes=500)
D326_Biop_Pro1 = D326_Biop_Pro1[D326_Biop_Pro1.obs['n_counts'] < 40000, :]
D326_Biop_Pro1 = D326_Biop_Pro1[D326_Biop_Pro1.obs['percent_mito'] < 0.5, :]
filtered out 12 cells that have less than 500 genes expressed
In [82]:
# scrublet
scrub = scr.Scrublet(D326_Biop_Pro1.X, expected_doublet_rate=0.023)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D326_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D326_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.27
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 22.9%
Overall doublet rate:
	Expected   = 2.3%
	Estimated  = 1.9%
Elapsed time: 2.3 seconds
Out[82]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eca851ba8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ebf9879b0>],
       dtype=object))
In [83]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D326_Biop_Pro1.X).predict()
D326_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.508995771408081 seconds
Jaccard graph constructed in 0.6974852085113525 seconds
Wrote graph to binary file in 0.05855703353881836 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.91892
Louvain completed 21 runs in 1.257519006729126 seconds
PhenoGraph complete in 2.546231269836426 seconds
Found communities [-1, ... 23], with sizes: [242, 1016, 445, 243, 205, 192, 166, 152, 133, 121, 115, 84, 79, 73, 67, 65, 46, 45, 43, 26, 22, 22, 16, 15, 14]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6098105907440186 seconds
Jaccard graph constructed in 0.539586067199707 seconds
Wrote graph to binary file in 0.2025914192199707 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.919851
Louvain completed 21 runs in 1.2839746475219727 seconds
PhenoGraph complete in 2.658162832260132 seconds
Found communities [-1, ... 27], with sizes: [236, 888, 367, 260, 205, 192, 192, 169, 169, 103, 87, 83, 83, 78, 73, 61, 57, 46, 44, 40, 40, 32, 29, 28, 21, 21, 16, 14, 13]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6093878746032715 seconds
Jaccard graph constructed in 0.5530076026916504 seconds
Wrote graph to binary file in 0.059752702713012695 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.917163
Louvain completed 21 runs in 1.249143123626709 seconds
PhenoGraph complete in 2.4859230518341064 seconds
Found communities [-1, ... 24], with sizes: [233, 1118, 396, 230, 222, 175, 147, 140, 126, 122, 120, 86, 73, 68, 65, 56, 46, 43, 42, 29, 27, 23, 17, 15, 14, 14]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5099210739135742 seconds
Jaccard graph constructed in 0.5796449184417725 seconds
Wrote graph to binary file in 0.1806011199951172 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.917036
Louvain completed 21 runs in 1.261793613433838 seconds
PhenoGraph complete in 2.54829740524292 seconds
Found communities [-1, ... 23], with sizes: [233, 1133, 357, 233, 210, 188, 166, 159, 118, 116, 85, 83, 77, 71, 69, 66, 50, 44, 42, 41, 35, 23, 22, 13, 13]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5111558437347412 seconds
Jaccard graph constructed in 0.6220264434814453 seconds
Wrote graph to binary file in 0.06535983085632324 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.917812
After 2 runs, maximum modularity is Q = 0.918871
Louvain completed 22 runs in 1.5968191623687744 seconds
PhenoGraph complete in 2.814903974533081 seconds
Found communities [-1, ... 26], with sizes: [233, 1160, 378, 215, 180, 173, 165, 147, 114, 87, 84, 79, 74, 72, 66, 62, 53, 47, 46, 45, 30, 26, 25, 25, 24, 13, 12, 12]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6108925342559814 seconds
Jaccard graph constructed in 0.6344013214111328 seconds
Wrote graph to binary file in 0.21292638778686523 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.919472
Louvain completed 21 runs in 1.2828552722930908 seconds
PhenoGraph complete in 2.7552881240844727 seconds
Found communities [-1, ... 24], with sizes: [243, 1152, 424, 233, 169, 155, 147, 142, 127, 90, 86, 83, 74, 70, 63, 61, 50, 47, 41, 41, 33, 31, 27, 23, 19, 16]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.612246036529541 seconds
Jaccard graph constructed in 0.6014208793640137 seconds
Wrote graph to binary file in 0.0702052116394043 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.918672
Louvain completed 21 runs in 1.3591969013214111 seconds
PhenoGraph complete in 2.6636412143707275 seconds
Found communities [-1, ... 22], with sizes: [176, 1206, 387, 240, 195, 163, 158, 133, 120, 115, 96, 85, 81, 74, 71, 63, 48, 48, 47, 43, 43, 22, 20, 13]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5166823863983154 seconds
Jaccard graph constructed in 0.6855242252349854 seconds
Wrote graph to binary file in 0.062480926513671875 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.91928
Louvain completed 21 runs in 1.307063341140747 seconds
PhenoGraph complete in 2.595158576965332 seconds
Found communities [-1, ... 21], with sizes: [257, 1075, 446, 235, 170, 160, 159, 157, 144, 126, 121, 87, 80, 69, 64, 62, 58, 47, 44, 29, 26, 18, 13]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5155360698699951 seconds
Jaccard graph constructed in 0.5446298122406006 seconds
Wrote graph to binary file in 0.20676875114440918 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.91494
Louvain completed 21 runs in 1.2904012203216553 seconds
PhenoGraph complete in 2.576587677001953 seconds
Found communities [-1, ... 25], with sizes: [239, 1146, 354, 230, 190, 177, 141, 135, 131, 117, 88, 78, 77, 74, 65, 63, 47, 46, 45, 43, 42, 34, 22, 19, 18, 13, 13]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6088831424713135 seconds
Jaccard graph constructed in 0.5589909553527832 seconds
Wrote graph to binary file in 0.06251049041748047 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.919988
After 6 runs, maximum modularity is Q = 0.921072
Louvain completed 26 runs in 1.6807591915130615 seconds
PhenoGraph complete in 2.935931921005249 seconds
Found communities [-1, ... 24], with sizes: [241, 975, 385, 306, 248, 194, 191, 137, 121, 85, 84, 83, 83, 70, 61, 58, 48, 47, 46, 44, 34, 31, 24, 19, 16, 16]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5100588798522949 seconds
Jaccard graph constructed in 0.5263171195983887 seconds
Wrote graph to binary file in 0.19574475288391113 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.917977
After 8 runs, maximum modularity is Q = 0.919186
Louvain completed 28 runs in 1.8500137329101562 seconds
PhenoGraph complete in 3.10420298576355 seconds
Found communities [-1, ... 22], with sizes: [236, 1136, 367, 235, 181, 176, 173, 154, 121, 118, 82, 78, 77, 71, 71, 67, 56, 51, 47, 43, 37, 32, 22, 16]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5084021091461182 seconds
Jaccard graph constructed in 0.5386958122253418 seconds
Wrote graph to binary file in 0.060370683670043945 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.917701
After 3 runs, maximum modularity is Q = 0.918983
Louvain completed 23 runs in 1.5238690376281738 seconds
PhenoGraph complete in 2.6456220149993896 seconds
Found communities [-1, ... 26], with sizes: [244, 1131, 379, 211, 198, 177, 156, 156, 129, 91, 86, 72, 72, 67, 66, 61, 47, 46, 44, 42, 32, 31, 24, 21, 17, 17, 16, 14]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5151913166046143 seconds
Jaccard graph constructed in 0.560924768447876 seconds
Wrote graph to binary file in 0.20106983184814453 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.919616
Louvain completed 21 runs in 1.2885427474975586 seconds
PhenoGraph complete in 2.5867364406585693 seconds
Found communities [-1, ... 23], with sizes: [236, 1150, 363, 212, 198, 173, 167, 167, 124, 118, 92, 89, 74, 71, 68, 59, 56, 47, 43, 42, 26, 23, 21, 16, 12]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5092031955718994 seconds
Jaccard graph constructed in 0.5330150127410889 seconds
Wrote graph to binary file in 0.05854654312133789 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.918719
Louvain completed 21 runs in 1.2392961978912354 seconds
PhenoGraph complete in 2.3549587726593018 seconds
Found communities [-1, ... 25], with sizes: [246, 1137, 321, 248, 221, 185, 160, 150, 123, 119, 84, 83, 76, 76, 62, 52, 48, 44, 41, 32, 29, 25, 21, 20, 19, 14, 11]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5080583095550537 seconds
Jaccard graph constructed in 0.5296463966369629 seconds
Wrote graph to binary file in 0.1806485652923584 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.919294
Louvain completed 21 runs in 1.2821345329284668 seconds
PhenoGraph complete in 2.513993263244629 seconds
Found communities [-1, ... 22], with sizes: [220, 1183, 362, 249, 170, 168, 158, 152, 118, 107, 105, 95, 89, 71, 64, 63, 50, 46, 46, 41, 28, 27, 18, 17]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5081584453582764 seconds
Jaccard graph constructed in 0.5443904399871826 seconds
Wrote graph to binary file in 0.059035539627075195 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.920404
Louvain completed 21 runs in 1.2713992595672607 seconds
PhenoGraph complete in 2.4024908542633057 seconds
Found communities [-1, ... 23], with sizes: [215, 1118, 481, 204, 174, 165, 161, 143, 127, 118, 112, 88, 77, 69, 68, 60, 48, 48, 47, 37, 34, 17, 13, 12, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.611882209777832 seconds
Jaccard graph constructed in 0.5448915958404541 seconds
Wrote graph to binary file in 0.17878127098083496 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.920401
Louvain completed 21 runs in 1.2232611179351807 seconds
PhenoGraph complete in 2.578368663787842 seconds
Found communities [-1, ... 25], with sizes: [264, 1095, 350, 238, 201, 190, 176, 166, 124, 99, 95, 84, 81, 68, 59, 46, 46, 44, 42, 40, 32, 22, 20, 20, 18, 14, 13]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5093872547149658 seconds
Jaccard graph constructed in 0.5328578948974609 seconds
Wrote graph to binary file in 0.1910557746887207 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.918529
Louvain completed 21 runs in 1.3119118213653564 seconds
PhenoGraph complete in 2.56008243560791 seconds
Found communities [-1, ... 27], with sizes: [233, 1125, 392, 220, 188, 177, 174, 159, 97, 91, 88, 88, 78, 63, 59, 57, 55, 47, 44, 37, 29, 28, 24, 22, 20, 15, 13, 13, 11]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.915266752243042 seconds
Jaccard graph constructed in 0.5336551666259766 seconds
Wrote graph to binary file in 0.05922102928161621 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.921915
Louvain completed 21 runs in 1.2477316856384277 seconds
PhenoGraph complete in 2.769258499145508 seconds
Found communities [-1, ... 26], with sizes: [266, 840, 352, 276, 222, 195, 190, 154, 140, 121, 117, 89, 84, 81, 74, 73, 57, 47, 44, 44, 32, 31, 29, 28, 21, 16, 13, 11]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5139877796173096 seconds
Jaccard graph constructed in 0.5335690975189209 seconds
Wrote graph to binary file in 0.19356060028076172 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.917977
Louvain completed 21 runs in 1.2736947536468506 seconds
PhenoGraph complete in 2.529722213745117 seconds
Found communities [-1, ... 23], with sizes: [222, 1130, 403, 239, 201, 184, 166, 133, 118, 111, 94, 79, 75, 74, 65, 60, 47, 46, 45, 43, 33, 27, 21, 17, 14]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6103405952453613 seconds
Jaccard graph constructed in 0.534160852432251 seconds
Wrote graph to binary file in 0.05935263633728027 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.918903
After 14 runs, maximum modularity is Q = 0.919967
Louvain completed 34 runs in 2.1069209575653076 seconds
PhenoGraph complete in 3.3257455825805664 seconds
Found communities [-1, ... 25], with sizes: [240, 1117, 412, 208, 195, 186, 183, 165, 114, 111, 86, 77, 72, 62, 60, 52, 48, 47, 39, 33, 27, 26, 24, 24, 14, 14, 11]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9112300872802734 seconds
Jaccard graph constructed in 0.5613467693328857 seconds
Wrote graph to binary file in 0.20331883430480957 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.918831
Louvain completed 21 runs in 1.2733023166656494 seconds
PhenoGraph complete in 2.965576410293579 seconds
Found communities [-1, ... 24], with sizes: [247, 1104, 347, 239, 219, 201, 159, 144, 132, 86, 85, 83, 80, 80, 67, 60, 51, 46, 45, 40, 28, 27, 22, 21, 19, 15]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5155503749847412 seconds
Jaccard graph constructed in 0.5484151840209961 seconds
Wrote graph to binary file in 0.06045269966125488 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.919968
Louvain completed 21 runs in 1.283388376235962 seconds
PhenoGraph complete in 2.423159599304199 seconds
Found communities [-1, ... 24], with sizes: [253, 1142, 383, 233, 185, 172, 161, 148, 130, 109, 85, 80, 78, 69, 67, 57, 46, 42, 38, 34, 30, 30, 26, 23, 15, 11]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.611957311630249 seconds
Jaccard graph constructed in 0.554969310760498 seconds
Wrote graph to binary file in 0.20175743103027344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.920297
After 2 runs, maximum modularity is Q = 0.921505
Louvain completed 22 runs in 1.5158917903900146 seconds
PhenoGraph complete in 2.899604082107544 seconds
Found communities [-1, ... 25], with sizes: [239, 1132, 374, 228, 192, 159, 152, 152, 141, 100, 87, 73, 72, 64, 62, 60, 60, 47, 46, 46, 34, 29, 27, 21, 20, 16, 14]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6114456653594971 seconds
Jaccard graph constructed in 0.5474045276641846 seconds
Wrote graph to binary file in 0.05917859077453613 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.918802
Louvain completed 21 runs in 1.2539029121398926 seconds
PhenoGraph complete in 2.4851999282836914 seconds
Found communities [-1, ... 23], with sizes: [253, 1092, 367, 228, 199, 193, 165, 161, 120, 120, 109, 91, 79, 76, 73, 53, 47, 46, 42, 37, 29, 21, 18, 17, 11]

In [84]:
sc.pp.normalize_per_cell(D326_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D326_Biop_Pro1) # log transform the data
D326_Biop_Pro1.raw = D326_Biop_Pro1 # freeze the object (for later use of the raw state of it)
In [85]:
D326_Biop_Pro1 = D326_Biop_Pro1[:, D326_Biop_Pro1.var['ribo_genes']]
D326_Biop_Pro1
Out[85]:
View of AnnData object with n_obs × n_vars = 2918 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [86]:
D339_Biop_Pro1 = sc.read_10x_mtx(
    './D339_Biop_Pro1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D339_Biop_Pro1.var_names_make_unique()
D339_Biop_Pro1.obs['manip'] = 'D339_Biop_Pro1'
D339_Biop_Pro1.obs['position'] = 'Proximal'
D339_Biop_Pro1.obs['method'] = 'Biopsy'
D339_Biop_Pro1.obs['donor'] = 'D339'
D339_Biop_Pro1.obs['name'] = ['D339_Biop_Pro1_' + s for s in list(D339_Biop_Pro1.obs.index)]
D339_Biop_Pro1.obs_names = D339_Biop_Pro1.obs['name']
D339_Biop_Pro1
... reading from cache file ./cache/D339_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[86]:
AnnData object with n_obs × n_vars = 762 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [87]:
sc.pl.highest_expr_genes(D339_Biop_Pro1, n_top=20)
In [88]:
sc.pp.filter_cells(D339_Biop_Pro1, min_genes=0)
mito_genes = D339_Biop_Pro1.var_names.str.startswith('MT-')
D339_Biop_Pro1.obs['percent_mito'] = np.sum(
    D339_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D339_Biop_Pro1.X, axis=1).A1
D339_Biop_Pro1.obs['n_counts'] = D339_Biop_Pro1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D339_Biop_Pro1.to_df())
ribo_genes = D339_Biop_Pro1.to_df().columns.isin(RB_genes_in_df)
D339_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D339_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D339_Biop_Pro1.X, axis=1).A1
D339_Biop_Pro1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D339_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [89]:
sc.pp.filter_cells(D339_Biop_Pro1, min_genes=500)
D339_Biop_Pro1 = D339_Biop_Pro1[D339_Biop_Pro1.obs['n_counts'] < 40000, :]
D339_Biop_Pro1 = D339_Biop_Pro1[D339_Biop_Pro1.obs['percent_mito'] < 0.2, :]
filtered out 8 cells that have less than 500 genes expressed
In [90]:
# scrublet
scrub = scr.Scrublet(D339_Biop_Pro1.X, expected_doublet_rate=0.006)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D339_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D339_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.05
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 36.6%
Overall doublet rate:
	Expected   = 0.6%
	Estimated  = 0.4%
Elapsed time: 0.4 seconds
Out[90]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ebf9c2898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb98aba58>],
       dtype=object))
In [91]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D339_Biop_Pro1.X).predict()
D339_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1105043888092041 seconds
Jaccard graph constructed in 0.3054227828979492 seconds
Wrote graph to binary file in 0.021872520446777344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.871843
Louvain completed 21 runs in 0.8053634166717529 seconds
PhenoGraph complete in 1.2517876625061035 seconds
Found communities [-1, ... 13], with sizes: [108, 112, 98, 90, 77, 65, 65, 65, 54, 51, 47, 33, 31, 28, 12]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11123776435852051 seconds
Jaccard graph constructed in 0.30702853202819824 seconds
Wrote graph to binary file in 0.02108168601989746 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.864588
Louvain completed 21 runs in 0.8700978755950928 seconds
PhenoGraph complete in 1.3188762664794922 seconds
Found communities [-1, ... 12], with sizes: [143, 109, 108, 87, 76, 62, 58, 57, 51, 50, 50, 37, 29, 19]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10672926902770996 seconds
Jaccard graph constructed in 0.30466508865356445 seconds
Wrote graph to binary file in 0.022786855697631836 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876061
Louvain completed 21 runs in 0.984729528427124 seconds
PhenoGraph complete in 1.4289076328277588 seconds
Found communities [-1, ... 14], with sizes: [123, 111, 101, 96, 68, 67, 61, 58, 54, 50, 45, 28, 21, 20, 17, 16]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11058402061462402 seconds
Jaccard graph constructed in 0.2976970672607422 seconds
Wrote graph to binary file in 0.018837690353393555 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.870992
Louvain completed 21 runs in 0.9724931716918945 seconds
PhenoGraph complete in 1.4083311557769775 seconds
Found communities [-1, ... 13], with sizes: [101, 114, 99, 76, 75, 72, 65, 61, 55, 53, 52, 34, 33, 25, 21]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10963940620422363 seconds
Jaccard graph constructed in 0.3056976795196533 seconds
Wrote graph to binary file in 0.021270751953125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.870528
Louvain completed 21 runs in 0.8304357528686523 seconds
PhenoGraph complete in 1.2748208045959473 seconds
Found communities [-1, ... 13], with sizes: [125, 110, 103, 87, 83, 71, 58, 56, 49, 48, 45, 31, 28, 27, 15]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11102032661437988 seconds
Jaccard graph constructed in 0.28727102279663086 seconds
Wrote graph to binary file in 0.020878076553344727 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.8745
Louvain completed 21 runs in 0.9878203868865967 seconds
PhenoGraph complete in 1.4170212745666504 seconds
Found communities [-1, ... 14], with sizes: [111, 116, 97, 84, 76, 75, 68, 57, 55, 51, 40, 35, 20, 20, 18, 13]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10653018951416016 seconds
Jaccard graph constructed in 0.3100135326385498 seconds
Wrote graph to binary file in 0.019186973571777344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.873832
Louvain completed 21 runs in 0.839667558670044 seconds
PhenoGraph complete in 1.2862062454223633 seconds
Found communities [-1, ... 13], with sizes: [134, 106, 95, 93, 84, 67, 66, 50, 49, 44, 43, 30, 27, 25, 23]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11025357246398926 seconds
Jaccard graph constructed in 0.29729771614074707 seconds
Wrote graph to binary file in 0.19408893585205078 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.866812
Louvain completed 21 runs in 0.8526337146759033 seconds
PhenoGraph complete in 1.4625613689422607 seconds
Found communities [-1, ... 13], with sizes: [135, 127, 114, 92, 84, 70, 53, 47, 44, 41, 39, 29, 25, 25, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10551738739013672 seconds
Jaccard graph constructed in 0.30333685874938965 seconds
Wrote graph to binary file in 0.020332813262939453 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.874799
Louvain completed 21 runs in 0.8151609897613525 seconds
PhenoGraph complete in 1.252237319946289 seconds
Found communities [-1, ... 14], with sizes: [102, 112, 100, 75, 74, 73, 72, 63, 59, 47, 45, 29, 27, 24, 19, 15]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11081314086914062 seconds
Jaccard graph constructed in 0.2903783321380615 seconds
Wrote graph to binary file in 0.017891645431518555 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.870067
Louvain completed 21 runs in 0.8367900848388672 seconds
PhenoGraph complete in 1.2644429206848145 seconds
Found communities [-1, ... 12], with sizes: [135, 100, 93, 92, 81, 71, 70, 61, 60, 49, 46, 28, 26, 24]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10536527633666992 seconds
Jaccard graph constructed in 0.28545403480529785 seconds
Wrote graph to binary file in 0.020786762237548828 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.873652
Louvain completed 21 runs in 0.8139078617095947 seconds
PhenoGraph complete in 1.2330331802368164 seconds
Found communities [-1, ... 13], with sizes: [116, 122, 95, 83, 80, 76, 72, 56, 50, 48, 44, 34, 22, 21, 17]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11239171028137207 seconds
Jaccard graph constructed in 0.3006908893585205 seconds
Wrote graph to binary file in 0.02051854133605957 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.871184
After 2 runs, maximum modularity is Q = 0.872313
After 3 runs, maximum modularity is Q = 0.874645
Louvain completed 23 runs in 1.3586170673370361 seconds
PhenoGraph complete in 1.8027243614196777 seconds
Found communities [-1, ... 13], with sizes: [114, 115, 93, 84, 77, 75, 68, 65, 64, 49, 43, 26, 26, 19, 18]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1068112850189209 seconds
Jaccard graph constructed in 0.29584717750549316 seconds
Wrote graph to binary file in 0.020511865615844727 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87659
Louvain completed 21 runs in 0.8569049835205078 seconds
PhenoGraph complete in 1.2886419296264648 seconds
Found communities [-1, ... 14], with sizes: [99, 129, 94, 85, 78, 61, 59, 56, 52, 51, 49, 36, 31, 24, 19, 13]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10520482063293457 seconds
Jaccard graph constructed in 0.29891347885131836 seconds
Wrote graph to binary file in 0.01911640167236328 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.873772
Louvain completed 21 runs in 0.8438427448272705 seconds
PhenoGraph complete in 1.274444341659546 seconds
Found communities [-1, ... 13], with sizes: [102, 113, 92, 88, 79, 69, 62, 56, 52, 48, 47, 40, 34, 30, 24]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10613346099853516 seconds
Jaccard graph constructed in 0.28910326957702637 seconds
Wrote graph to binary file in 0.02112102508544922 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869099
After 7 runs, maximum modularity is Q = 0.87019
Louvain completed 27 runs in 1.1831326484680176 seconds
PhenoGraph complete in 1.6093153953552246 seconds
Found communities [-1, ... 13], with sizes: [100, 126, 89, 88, 75, 74, 71, 53, 49, 48, 48, 33, 30, 29, 23]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10950160026550293 seconds
Jaccard graph constructed in 0.2922396659851074 seconds
Wrote graph to binary file in 0.021924734115600586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.871361
After 2 runs, maximum modularity is Q = 0.873418
After 6 runs, maximum modularity is Q = 0.874556
Louvain completed 26 runs in 1.2192704677581787 seconds
PhenoGraph complete in 1.6515581607818604 seconds
Found communities [-1, ... 13], with sizes: [115, 115, 108, 85, 77, 73, 61, 53, 51, 50, 48, 30, 25, 23, 22]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1106576919555664 seconds
Jaccard graph constructed in 0.29552173614501953 seconds
Wrote graph to binary file in 0.1611185073852539 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.865542
After 2 runs, maximum modularity is Q = 0.869601
After 3 runs, maximum modularity is Q = 0.870671
After 4 runs, maximum modularity is Q = 0.872093
Louvain completed 24 runs in 1.3465197086334229 seconds
PhenoGraph complete in 1.9232587814331055 seconds
Found communities [-1, ... 12], with sizes: [128, 116, 100, 89, 76, 70, 63, 59, 57, 51, 46, 30, 26, 25]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10527610778808594 seconds
Jaccard graph constructed in 0.30194950103759766 seconds
Wrote graph to binary file in 0.019508838653564453 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.865154
After 2 runs, maximum modularity is Q = 0.866234
Louvain completed 22 runs in 1.0532629489898682 seconds
PhenoGraph complete in 1.490898609161377 seconds
Found communities [-1, ... 13], with sizes: [115, 119, 98, 82, 81, 71, 61, 54, 51, 49, 47, 33, 31, 26, 18]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11021256446838379 seconds
Jaccard graph constructed in 0.2950124740600586 seconds
Wrote graph to binary file in 0.019244670867919922 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869625
After 14 runs, maximum modularity is Q = 0.870629
Louvain completed 34 runs in 1.3480546474456787 seconds
PhenoGraph complete in 1.7820680141448975 seconds
Found communities [-1, ... 13], with sizes: [101, 124, 109, 91, 86, 70, 69, 50, 48, 48, 45, 36, 26, 20, 13]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10982561111450195 seconds
Jaccard graph constructed in 0.3057708740234375 seconds
Wrote graph to binary file in 0.01995229721069336 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.870191
Louvain completed 21 runs in 0.7929477691650391 seconds
PhenoGraph complete in 1.2375876903533936 seconds
Found communities [-1, ... 14], with sizes: [134, 115, 85, 82, 70, 67, 66, 47, 46, 45, 44, 42, 39, 22, 21, 11]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10883021354675293 seconds
Jaccard graph constructed in 0.2888777256011963 seconds
Wrote graph to binary file in 0.019455909729003906 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.874177
Louvain completed 21 runs in 0.8041174411773682 seconds
PhenoGraph complete in 1.2286975383758545 seconds
Found communities [-1, ... 12], with sizes: [106, 113, 101, 94, 78, 68, 67, 53, 52, 46, 46, 39, 38, 35]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10550808906555176 seconds
Jaccard graph constructed in 0.29207921028137207 seconds
Wrote graph to binary file in 0.02077507972717285 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868459
After 17 runs, maximum modularity is Q = 0.869754
Louvain completed 37 runs in 1.6946866512298584 seconds
PhenoGraph complete in 2.1203715801239014 seconds
Found communities [-1, ... 13], with sizes: [118, 115, 99, 80, 77, 72, 64, 60, 50, 49, 43, 36, 27, 24, 22]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10578250885009766 seconds
Jaccard graph constructed in 0.3411259651184082 seconds
Wrote graph to binary file in 0.030796527862548828 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872573
Louvain completed 21 runs in 0.8219902515411377 seconds
PhenoGraph complete in 1.3151566982269287 seconds
Found communities [-1, ... 12], with sizes: [134, 110, 94, 93, 90, 69, 61, 57, 56, 48, 47, 28, 27, 22]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1129140853881836 seconds
Jaccard graph constructed in 0.2823147773742676 seconds
Wrote graph to binary file in 0.14710569381713867 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.873892
Louvain completed 21 runs in 0.825577974319458 seconds
PhenoGraph complete in 1.3753232955932617 seconds
Found communities [-1, ... 12], with sizes: [130, 120, 93, 90, 81, 80, 60, 52, 52, 49, 48, 38, 22, 21]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10958337783813477 seconds
Jaccard graph constructed in 0.3104887008666992 seconds
Wrote graph to binary file in 0.020822525024414062 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.86721
After 3 runs, maximum modularity is Q = 0.868278
Louvain completed 23 runs in 1.0420918464660645 seconds
PhenoGraph complete in 1.4943444728851318 seconds
Found communities [-1, ... 14], with sizes: [114, 118, 111, 86, 64, 60, 57, 52, 50, 46, 43, 42, 26, 26, 24, 17]

In [92]:
sc.pp.normalize_per_cell(D339_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D339_Biop_Pro1) # log transform the data
D339_Biop_Pro1.raw = D339_Biop_Pro1 # freeze the object (for later use of the raw state of it)
In [93]:
D339_Biop_Pro1 = D339_Biop_Pro1[:, D339_Biop_Pro1.var['ribo_genes']]
D339_Biop_Pro1
Out[93]:
View of AnnData object with n_obs × n_vars = 749 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [94]:
D344_Biop_Pro1 = sc.read_10x_mtx(
    './D344_Biop_Pro1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D344_Biop_Pro1.var_names_make_unique()
D344_Biop_Pro1.obs['manip'] = 'D344_Biop_Pro1'
D344_Biop_Pro1.obs['position'] = 'Proximal'
D344_Biop_Pro1.obs['method'] = 'Biopsy'
D344_Biop_Pro1.obs['donor'] = 'D344'
D344_Biop_Pro1.obs['name'] = ['D344_Biop_Pro1_' + s for s in list(D344_Biop_Pro1.obs.index)]
D344_Biop_Pro1.obs_names = D344_Biop_Pro1.obs['name']
D344_Biop_Pro1
... reading from cache file ./cache/D344_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[94]:
AnnData object with n_obs × n_vars = 313 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [95]:
sc.pl.highest_expr_genes(D344_Biop_Pro1, n_top=20)
In [96]:
sc.pp.filter_cells(D344_Biop_Pro1, min_genes=0)
mito_genes = D344_Biop_Pro1.var_names.str.startswith('MT-')
D344_Biop_Pro1.obs['percent_mito'] = np.sum(
    D344_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D344_Biop_Pro1.X, axis=1).A1
D344_Biop_Pro1.obs['n_counts'] = D344_Biop_Pro1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D344_Biop_Pro1.to_df())
ribo_genes = D344_Biop_Pro1.to_df().columns.isin(RB_genes_in_df)
D344_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D344_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D344_Biop_Pro1.X, axis=1).A1
D344_Biop_Pro1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D344_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [97]:
sc.pp.filter_cells(D344_Biop_Pro1, min_genes=500)
D344_Biop_Pro1 = D344_Biop_Pro1[D344_Biop_Pro1.obs['n_counts'] < 40000, :]
D344_Biop_Pro1 = D344_Biop_Pro1[D344_Biop_Pro1.obs['percent_mito'] < 0.15, :]
filtered out 7 cells that have less than 500 genes expressed
In [98]:
# scrublet
scrub = scr.Scrublet(D344_Biop_Pro1.X, expected_doublet_rate=0.004)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D344_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D344_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.02
Detected doublet rate = 0.7%
Estimated detectable doublet fraction = 26.2%
Overall doublet rate:
	Expected   = 0.4%
	Estimated  = 2.5%
Elapsed time: 0.2 seconds
Out[98]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb9d3ba58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ebfd1e860>],
       dtype=object))
In [99]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D344_Biop_Pro1.X).predict()
D344_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10980510711669922 seconds
Jaccard graph constructed in 0.2169487476348877 seconds
Wrote graph to binary file in 0.007748126983642578 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.809501
Louvain completed 21 runs in 0.8823840618133545 seconds
PhenoGraph complete in 1.2258474826812744 seconds
Found communities [-1, ... 6], with sizes: [100, 57, 49, 47, 43, 40, 26, 19]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11079072952270508 seconds
Jaccard graph constructed in 0.22571182250976562 seconds
Wrote graph to binary file in 0.008206605911254883 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.807092
Louvain completed 21 runs in 0.7661454677581787 seconds
PhenoGraph complete in 1.1178562641143799 seconds
Found communities [-1, ... 7], with sizes: [80, 60, 54, 51, 42, 41, 26, 15, 12]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10866880416870117 seconds
Jaccard graph constructed in 0.2189793586730957 seconds
Wrote graph to binary file in 0.012250185012817383 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.804122
Louvain completed 21 runs in 0.7788655757904053 seconds
PhenoGraph complete in 1.1266555786132812 seconds
Found communities [-1, ... 6], with sizes: [77, 70, 56, 50, 48, 35, 34, 11]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10855412483215332 seconds
Jaccard graph constructed in 0.20752477645874023 seconds
Wrote graph to binary file in 0.011024951934814453 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.806949
Louvain completed 21 runs in 0.7930207252502441 seconds
PhenoGraph complete in 1.1296765804290771 seconds
Found communities [-1, ... 6], with sizes: [107, 53, 47, 44, 42, 41, 30, 17]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10944557189941406 seconds
Jaccard graph constructed in 0.2146296501159668 seconds
Wrote graph to binary file in 0.010806798934936523 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.801462
Louvain completed 21 runs in 0.7459042072296143 seconds
PhenoGraph complete in 1.090465784072876 seconds
Found communities [-1, ... 6], with sizes: [86, 53, 52, 48, 48, 47, 29, 18]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11103200912475586 seconds
Jaccard graph constructed in 0.21146082878112793 seconds
Wrote graph to binary file in 0.009027242660522461 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.801157
Louvain completed 21 runs in 0.7538511753082275 seconds
PhenoGraph complete in 1.0926225185394287 seconds
Found communities [-1, ... 8], with sizes: [85, 56, 46, 45, 41, 36, 25, 23, 12, 12]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10889720916748047 seconds
Jaccard graph constructed in 0.20839786529541016 seconds
Wrote graph to binary file in 0.012212038040161133 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.794684
Louvain completed 21 runs in 0.8640484809875488 seconds
PhenoGraph complete in 1.2027547359466553 seconds
Found communities [-1, ... 8], with sizes: [76, 49, 47, 45, 38, 32, 31, 26, 26, 11]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1086585521697998 seconds
Jaccard graph constructed in 0.2087383270263672 seconds
Wrote graph to binary file in 0.013257026672363281 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.802179
Louvain completed 21 runs in 0.7582082748413086 seconds
PhenoGraph complete in 1.0966300964355469 seconds
Found communities [-1, ... 6], with sizes: [87, 62, 55, 47, 44, 42, 33, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11080551147460938 seconds
Jaccard graph constructed in 0.21363544464111328 seconds
Wrote graph to binary file in 0.008567571640014648 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.804379
Louvain completed 21 runs in 0.7859487533569336 seconds
PhenoGraph complete in 1.1277024745941162 seconds
Found communities [-1, ... 8], with sizes: [75, 48, 46, 41, 36, 34, 32, 31, 27, 11]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1095590591430664 seconds
Jaccard graph constructed in 0.22375011444091797 seconds
Wrote graph to binary file in 0.008483171463012695 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.814029
Louvain completed 21 runs in 0.8890140056610107 seconds
PhenoGraph complete in 1.2399089336395264 seconds
Found communities [-1, ... 6], with sizes: [106, 55, 45, 43, 41, 40, 27, 24]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10959768295288086 seconds
Jaccard graph constructed in 0.22372150421142578 seconds
Wrote graph to binary file in 0.011902332305908203 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.807101
Louvain completed 21 runs in 0.807478666305542 seconds
PhenoGraph complete in 1.164170503616333 seconds
Found communities [-1, ... 5], with sizes: [101, 69, 54, 47, 44, 33, 33]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1050570011138916 seconds
Jaccard graph constructed in 0.23449349403381348 seconds
Wrote graph to binary file in 0.00814366340637207 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.788124
Louvain completed 21 runs in 0.8163847923278809 seconds
PhenoGraph complete in 1.1744191646575928 seconds
Found communities [-1, ... 6], with sizes: [92, 65, 52, 44, 42, 32, 31, 23]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10597991943359375 seconds
Jaccard graph constructed in 0.22053265571594238 seconds
Wrote graph to binary file in 0.008925199508666992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.794395
Louvain completed 21 runs in 0.7983870506286621 seconds
PhenoGraph complete in 1.1408591270446777 seconds
Found communities [-1, ... 6], with sizes: [76, 62, 59, 53, 44, 41, 25, 21]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10756349563598633 seconds
Jaccard graph constructed in 0.21567511558532715 seconds
Wrote graph to binary file in 0.011394977569580078 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.80815
Louvain completed 21 runs in 0.9231748580932617 seconds
PhenoGraph complete in 1.266977310180664 seconds
Found communities [-1, ... 7], with sizes: [69, 60, 57, 50, 47, 42, 31, 13, 12]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11142158508300781 seconds
Jaccard graph constructed in 0.2341620922088623 seconds
Wrote graph to binary file in 0.00862741470336914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.815264
Louvain completed 21 runs in 0.8969509601593018 seconds
PhenoGraph complete in 1.258319616317749 seconds
Found communities [-1, ... 7], with sizes: [98, 47, 44, 44, 41, 36, 27, 24, 20]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10501313209533691 seconds
Jaccard graph constructed in 0.2125685214996338 seconds
Wrote graph to binary file in 0.19133543968200684 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.797398
Louvain completed 21 runs in 0.853858470916748 seconds
PhenoGraph complete in 1.3762588500976562 seconds
Found communities [-1, ... 5], with sizes: [79, 72, 55, 54, 50, 45, 26]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11233663558959961 seconds
Jaccard graph constructed in 0.2313520908355713 seconds
Wrote graph to binary file in 0.013152837753295898 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.805367
Louvain completed 21 runs in 0.7601668834686279 seconds
PhenoGraph complete in 1.12748122215271 seconds
Found communities [-1, ... 7], with sizes: [78, 63, 52, 44, 44, 39, 25, 23, 13]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10541272163391113 seconds
Jaccard graph constructed in 0.24375271797180176 seconds
Wrote graph to binary file in 0.012204170227050781 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.79969
Louvain completed 21 runs in 0.7544903755187988 seconds
PhenoGraph complete in 1.1274042129516602 seconds
Found communities [-1, ... 6], with sizes: [99, 64, 45, 44, 41, 35, 29, 24]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11178898811340332 seconds
Jaccard graph constructed in 0.2444000244140625 seconds
Wrote graph to binary file in 0.015204668045043945 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.804987
Louvain completed 21 runs in 0.8856735229492188 seconds
PhenoGraph complete in 1.2687458992004395 seconds
Found communities [-1, ... 7], with sizes: [93, 55, 45, 43, 42, 41, 30, 21, 11]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1050407886505127 seconds
Jaccard graph constructed in 0.2237563133239746 seconds
Wrote graph to binary file in 0.0132598876953125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.811552
Louvain completed 21 runs in 0.8793680667877197 seconds
PhenoGraph complete in 1.2291104793548584 seconds
Found communities [-1, ... 6], with sizes: [88, 62, 59, 55, 41, 38, 27, 11]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10558271408081055 seconds
Jaccard graph constructed in 0.22791576385498047 seconds
Wrote graph to binary file in 0.012525081634521484 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.797517
Louvain completed 21 runs in 0.7461190223693848 seconds
PhenoGraph complete in 1.116572618484497 seconds
Found communities [-1, ... 6], with sizes: [88, 55, 53, 50, 43, 35, 31, 26]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10491681098937988 seconds
Jaccard graph constructed in 0.2271714210510254 seconds
Wrote graph to binary file in 0.013199329376220703 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.801786
Louvain completed 21 runs in 0.8835361003875732 seconds
PhenoGraph complete in 1.236802101135254 seconds
Found communities [-1, ... 5], with sizes: [87, 57, 54, 53, 50, 48, 32]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10529279708862305 seconds
Jaccard graph constructed in 0.21335053443908691 seconds
Wrote graph to binary file in 0.013172149658203125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.815623
Louvain completed 21 runs in 0.7665512561798096 seconds
PhenoGraph complete in 1.106257677078247 seconds
Found communities [-1, ... 5], with sizes: [105, 59, 57, 47, 45, 42, 26]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10492658615112305 seconds
Jaccard graph constructed in 0.23510122299194336 seconds
Wrote graph to binary file in 0.01239776611328125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.800669
Louvain completed 21 runs in 0.7814671993255615 seconds
PhenoGraph complete in 1.1406383514404297 seconds
Found communities [-1, ... 6], with sizes: [89, 63, 59, 46, 43, 40, 30, 11]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10491538047790527 seconds
Jaccard graph constructed in 0.23580455780029297 seconds
Wrote graph to binary file in 0.011727333068847656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.806018
Louvain completed 21 runs in 0.7661678791046143 seconds
PhenoGraph complete in 1.1444058418273926 seconds
Found communities [-1, ... 5], with sizes: [94, 59, 58, 48, 46, 43, 33]

In [100]:
sc.pp.normalize_per_cell(D344_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D344_Biop_Pro1) # log transform the data
D344_Biop_Pro1.raw = D344_Biop_Pro1 # freeze the object (for later use of the raw state of it)
In [101]:
D344_Biop_Pro1 = D344_Biop_Pro1[:, D344_Biop_Pro1.var['ribo_genes']]
D344_Biop_Pro1
Out[101]:
View of AnnData object with n_obs × n_vars = 305 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [102]:
D353_Biop_Pro1 = sc.read_10x_mtx(
    './D353_Biop_Pro1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D353_Biop_Pro1.var_names_make_unique()
D353_Biop_Pro1.obs['manip'] = 'D353_Biop_Pro1'
D353_Biop_Pro1.obs['position'] = 'Proximal'
D353_Biop_Pro1.obs['method'] = 'Biopsy'
D353_Biop_Pro1.obs['donor'] = 'D353'
D353_Biop_Pro1.obs['name'] = ['D353_Biop_Pro1' + s for s in list(D353_Biop_Pro1.obs.index)]
D353_Biop_Pro1.obs_names = D353_Biop_Pro1.obs['name']
D353_Biop_Pro1
... reading from cache file ./cache/D353_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[102]:
AnnData object with n_obs × n_vars = 4234 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [103]:
sc.pl.highest_expr_genes(D353_Biop_Pro1, n_top=20)
In [104]:
sc.pp.filter_cells(D353_Biop_Pro1, min_genes=0)
mito_genes = D353_Biop_Pro1.var_names.str.startswith('MT-')
D353_Biop_Pro1.obs['percent_mito'] = np.sum(
    D353_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D353_Biop_Pro1.X, axis=1).A1
D353_Biop_Pro1.obs['n_counts'] = D353_Biop_Pro1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Biop_Pro1.to_df())
ribo_genes = D353_Biop_Pro1.to_df().columns.isin(RB_genes_in_df)
D353_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D353_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D353_Biop_Pro1.X, axis=1).A1
D353_Biop_Pro1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D353_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [105]:
sc.pp.filter_cells(D353_Biop_Pro1, min_genes=500)
D353_Biop_Pro1 = D353_Biop_Pro1[D353_Biop_Pro1.obs['n_counts'] < 15000, :]
D353_Biop_Pro1 = D353_Biop_Pro1[D353_Biop_Pro1.obs['percent_mito'] < 0.25, :]
filtered out 41 cells that have less than 500 genes expressed
In [106]:
# scrublet
scrub = scr.Scrublet(D353_Biop_Pro1.X, expected_doublet_rate=0.032)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D353_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D353_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.39
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 17.4%
Overall doublet rate:
	Expected   = 3.2%
	Estimated  = 2.5%
Elapsed time: 2.6 seconds
Out[106]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb9f09240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb943e438>],
       dtype=object))
In [107]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D353_Biop_Pro1.X).predict()
D353_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.1124141216278076 seconds
Jaccard graph constructed in 0.6793932914733887 seconds
Wrote graph to binary file in 0.07717013359069824 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.898435
Louvain completed 21 runs in 1.5983195304870605 seconds
PhenoGraph complete in 3.4864113330841064 seconds
Found communities [-1, ... 20], with sizes: [293, 1875, 602, 412, 354, 339, 261, 206, 169, 143, 106, 103, 64, 56, 46, 37, 36, 35, 34, 29, 22, 15]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.5156469345092773 seconds
Jaccard graph constructed in 0.6703004837036133 seconds
Wrote graph to binary file in 0.23721718788146973 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904477
Louvain completed 21 runs in 1.6786298751831055 seconds
PhenoGraph complete in 4.118005275726318 seconds
Found communities [-1, ... 18], with sizes: [263, 1908, 707, 404, 333, 314, 240, 238, 187, 111, 98, 80, 62, 61, 58, 46, 45, 38, 24, 20]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3126440048217773 seconds
Jaccard graph constructed in 0.695244550704956 seconds
Wrote graph to binary file in 0.2376270294189453 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903017
Louvain completed 21 runs in 1.601078987121582 seconds
PhenoGraph complete in 3.8658385276794434 seconds
Found communities [-1, ... 19], with sizes: [256, 1906, 629, 418, 359, 349, 348, 253, 173, 88, 72, 58, 54, 47, 45, 40, 38, 37, 30, 23, 14]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3111910820007324 seconds
Jaccard graph constructed in 0.6545121669769287 seconds
Wrote graph to binary file in 0.07905054092407227 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.894391
After 3 runs, maximum modularity is Q = 0.895731
Louvain completed 23 runs in 2.066336154937744 seconds
PhenoGraph complete in 4.128947019577026 seconds
Found communities [-1, ... 20], with sizes: [254, 1881, 670, 411, 371, 328, 247, 178, 126, 115, 112, 110, 109, 54, 46, 41, 38, 38, 35, 33, 21, 19]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.1168477535247803 seconds
Jaccard graph constructed in 0.6786074638366699 seconds
Wrote graph to binary file in 0.23377561569213867 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.898554
Louvain completed 21 runs in 1.6866297721862793 seconds
PhenoGraph complete in 3.7325732707977295 seconds
Found communities [-1, ... 18], with sizes: [247, 1953, 582, 418, 399, 368, 328, 263, 152, 108, 77, 58, 50, 46, 46, 39, 36, 30, 25, 12]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2134339809417725 seconds
Jaccard graph constructed in 0.6675515174865723 seconds
Wrote graph to binary file in 0.23641681671142578 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903043
Louvain completed 21 runs in 1.6272809505462646 seconds
PhenoGraph complete in 3.7634260654449463 seconds
Found communities [-1, ... 18], with sizes: [265, 1929, 662, 417, 383, 333, 315, 183, 171, 109, 90, 65, 63, 48, 47, 42, 41, 32, 24, 18]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.214933156967163 seconds
Jaccard graph constructed in 0.672339677810669 seconds
Wrote graph to binary file in 0.07712316513061523 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902923
Louvain completed 21 runs in 1.6528730392456055 seconds
PhenoGraph complete in 3.6350204944610596 seconds
Found communities [-1, ... 21], with sizes: [250, 1901, 666, 397, 348, 316, 262, 220, 211, 121, 104, 84, 63, 46, 44, 39, 32, 31, 28, 26, 23, 13, 12]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.1127710342407227 seconds
Jaccard graph constructed in 0.6458017826080322 seconds
Wrote graph to binary file in 0.21735024452209473 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.898646
Louvain completed 21 runs in 1.7578880786895752 seconds
PhenoGraph complete in 3.752281427383423 seconds
Found communities [-1, ... 18], with sizes: [232, 1873, 674, 409, 378, 367, 349, 270, 110, 105, 84, 61, 57, 47, 46, 45, 40, 39, 31, 20]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3129191398620605 seconds
Jaccard graph constructed in 0.6800203323364258 seconds
Wrote graph to binary file in 0.0765378475189209 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895991
Louvain completed 21 runs in 1.6098217964172363 seconds
PhenoGraph complete in 3.696824312210083 seconds
Found communities [-1, ... 19], with sizes: [258, 1929, 581, 399, 387, 333, 249, 241, 189, 137, 110, 83, 55, 54, 47, 40, 37, 31, 29, 27, 21]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.314307451248169 seconds
Jaccard graph constructed in 0.8422415256500244 seconds
Wrote graph to binary file in 0.0751194953918457 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895983
After 15 runs, maximum modularity is Q = 0.897442
Louvain completed 35 runs in 2.833246946334839 seconds
PhenoGraph complete in 5.0813775062561035 seconds
Found communities [-1, ... 21], with sizes: [298, 1898, 600, 407, 355, 335, 251, 216, 191, 114, 112, 68, 57, 47, 46, 45, 41, 40, 38, 30, 24, 13, 11]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3129487037658691 seconds
Jaccard graph constructed in 0.669677734375 seconds
Wrote graph to binary file in 0.24205803871154785 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902166
Louvain completed 21 runs in 1.6867096424102783 seconds
PhenoGraph complete in 3.930549144744873 seconds
Found communities [-1, ... 19], with sizes: [248, 1867, 677, 403, 394, 382, 351, 157, 109, 106, 79, 77, 58, 57, 51, 47, 46, 39, 39, 32, 18]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3168745040893555 seconds
Jaccard graph constructed in 0.7001020908355713 seconds
Wrote graph to binary file in 0.07872438430786133 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.897693
Louvain completed 21 runs in 1.6620802879333496 seconds
PhenoGraph complete in 3.7772116661071777 seconds
Found communities [-1, ... 16], with sizes: [250, 1954, 623, 417, 413, 352, 340, 266, 111, 98, 69, 68, 65, 55, 48, 44, 34, 30]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3143353462219238 seconds
Jaccard graph constructed in 0.6794734001159668 seconds
Wrote graph to binary file in 0.23280668258666992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905778
Louvain completed 21 runs in 1.6485168933868408 seconds
PhenoGraph complete in 3.8949456214904785 seconds
Found communities [-1, ... 19], with sizes: [299, 1903, 625, 404, 389, 361, 334, 250, 104, 85, 84, 71, 59, 51, 49, 46, 32, 32, 26, 21, 12]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3123993873596191 seconds
Jaccard graph constructed in 0.6693263053894043 seconds
Wrote graph to binary file in 0.21507835388183594 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901388
Louvain completed 21 runs in 1.5971555709838867 seconds
PhenoGraph complete in 3.8123550415039062 seconds
Found communities [-1, ... 17], with sizes: [295, 1912, 653, 384, 352, 345, 250, 195, 179, 149, 104, 85, 81, 52, 52, 46, 41, 39, 23]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3126029968261719 seconds
Jaccard graph constructed in 0.6866474151611328 seconds
Wrote graph to binary file in 0.07668447494506836 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.898908
Louvain completed 21 runs in 1.5656707286834717 seconds
PhenoGraph complete in 3.6590311527252197 seconds
Found communities [-1, ... 18], with sizes: [267, 1946, 655, 401, 387, 337, 332, 260, 103, 86, 78, 74, 67, 57, 45, 41, 40, 28, 21, 12]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.4116289615631104 seconds
Jaccard graph constructed in 0.6775290966033936 seconds
Wrote graph to binary file in 0.23298907279968262 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901293
After 17 runs, maximum modularity is Q = 0.90241
Louvain completed 37 runs in 3.1329915523529053 seconds
PhenoGraph complete in 5.473968029022217 seconds
Found communities [-1, ... 20], with sizes: [272, 1909, 631, 409, 359, 333, 229, 185, 168, 134, 113, 91, 59, 57, 56, 48, 43, 38, 31, 27, 27, 18]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3149909973144531 seconds
Jaccard graph constructed in 0.6827573776245117 seconds
Wrote graph to binary file in 0.24138998985290527 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.898244
Louvain completed 21 runs in 1.6458306312561035 seconds
PhenoGraph complete in 3.905277729034424 seconds
Found communities [-1, ... 20], with sizes: [264, 1854, 703, 359, 354, 347, 259, 201, 197, 106, 88, 87, 63, 62, 58, 47, 45, 43, 33, 28, 21, 18]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3126351833343506 seconds
Jaccard graph constructed in 0.6465389728546143 seconds
Wrote graph to binary file in 0.07648181915283203 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902466
After 7 runs, maximum modularity is Q = 0.903564
Louvain completed 27 runs in 2.278730630874634 seconds
PhenoGraph complete in 4.331044673919678 seconds
Found communities [-1, ... 17], with sizes: [256, 1878, 654, 441, 353, 314, 250, 248, 206, 155, 101, 81, 60, 49, 46, 46, 41, 35, 23]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3133964538574219 seconds
Jaccard graph constructed in 0.6469383239746094 seconds
Wrote graph to binary file in 0.24372434616088867 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902293
After 2 runs, maximum modularity is Q = 0.904136
Louvain completed 22 runs in 1.887833595275879 seconds
PhenoGraph complete in 4.10889196395874 seconds
Found communities [-1, ... 20], with sizes: [248, 1930, 678, 416, 383, 373, 300, 184, 105, 102, 92, 70, 46, 46, 42, 42, 40, 36, 32, 28, 22, 22]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3120794296264648 seconds
Jaccard graph constructed in 0.6420383453369141 seconds
Wrote graph to binary file in 0.07879233360290527 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899317
After 2 runs, maximum modularity is Q = 0.901301
Louvain completed 22 runs in 1.9003074169158936 seconds
PhenoGraph complete in 3.951625108718872 seconds
Found communities [-1, ... 19], with sizes: [278, 1924, 615, 430, 351, 344, 240, 179, 146, 104, 94, 91, 86, 72, 60, 48, 46, 38, 37, 35, 19]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3140530586242676 seconds
Jaccard graph constructed in 0.6724953651428223 seconds
Wrote graph to binary file in 0.23605895042419434 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.896487
Louvain completed 21 runs in 1.590254306793213 seconds
PhenoGraph complete in 3.8298606872558594 seconds
Found communities [-1, ... 18], with sizes: [278, 1956, 627, 418, 347, 337, 246, 162, 135, 112, 103, 89, 82, 81, 59, 58, 42, 39, 35, 31]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.3131976127624512 seconds
Jaccard graph constructed in 0.6590533256530762 seconds
Wrote graph to binary file in 0.2455885410308838 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.894787
Louvain completed 21 runs in 1.6014740467071533 seconds
PhenoGraph complete in 3.8447322845458984 seconds
Found communities [-1, ... 16], with sizes: [263, 1941, 636, 409, 373, 360, 344, 249, 188, 133, 62, 54, 52, 46, 44, 34, 26, 23]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9126551151275635 seconds
Jaccard graph constructed in 0.675278902053833 seconds
Wrote graph to binary file in 0.07746315002441406 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899812
After 3 runs, maximum modularity is Q = 0.901179
Louvain completed 23 runs in 1.9313156604766846 seconds
PhenoGraph complete in 3.61487078666687 seconds
Found communities [-1, ... 17], with sizes: [250, 1906, 637, 412, 412, 347, 340, 263, 130, 109, 103, 56, 56, 46, 41, 39, 36, 27, 27]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.5129027366638184 seconds
Jaccard graph constructed in 0.6978867053985596 seconds
Wrote graph to binary file in 0.23975682258605957 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901892
Louvain completed 21 runs in 1.7766077518463135 seconds
PhenoGraph complete in 4.2457115650177 seconds
Found communities [-1, ... 19], with sizes: [300, 1891, 613, 428, 394, 358, 335, 246, 100, 100, 85, 77, 56, 50, 47, 41, 32, 26, 24, 23, 11]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.316267490386963 seconds
Jaccard graph constructed in 0.6971895694732666 seconds
Wrote graph to binary file in 0.07730555534362793 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900887
Louvain completed 21 runs in 1.6947228908538818 seconds
PhenoGraph complete in 3.8027455806732178 seconds
Found communities [-1, ... 17], with sizes: [280, 1927, 636, 412, 377, 350, 347, 256, 157, 104, 81, 54, 46, 45, 45, 41, 38, 23, 18]

In [108]:
sc.pp.normalize_per_cell(D353_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D353_Biop_Pro1) # log transform the data
D353_Biop_Pro1.raw = D353_Biop_Pro1 # freeze the object (for later use of the raw state of it)
In [109]:
D353_Biop_Pro1 = D353_Biop_Pro1[:, D353_Biop_Pro1.var['ribo_genes']]
D353_Biop_Pro1
Out[109]:
View of AnnData object with n_obs × n_vars = 4190 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [110]:
D354_Biop_Pro1 = sc.read_10x_mtx(
    './D354_Biop_Pro1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D354_Biop_Pro1.var_names_make_unique()
D354_Biop_Pro1.obs['manip'] = 'D354_Biop_Pro1'
D354_Biop_Pro1.obs['position'] = 'Proximal'
D354_Biop_Pro1.obs['method'] = 'Biopsy'
D354_Biop_Pro1.obs['donor'] = 'D354'
D354_Biop_Pro1.obs['name'] = ['D354_Biop_Pro1_' + s for s in list(D354_Biop_Pro1.obs.index)]
D354_Biop_Pro1.obs_names = D354_Biop_Pro1.obs['name']
D354_Biop_Pro1
... reading from cache file ./cache/D354_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[110]:
AnnData object with n_obs × n_vars = 1877 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [111]:
sc.pl.highest_expr_genes(D354_Biop_Pro1, n_top=20)
In [112]:
sc.pp.filter_cells(D354_Biop_Pro1, min_genes=0)
mito_genes = D354_Biop_Pro1.var_names.str.startswith('MT-')
D354_Biop_Pro1.obs['percent_mito'] = np.sum(
    D354_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D354_Biop_Pro1.X, axis=1).A1
D354_Biop_Pro1.obs['n_counts'] = D354_Biop_Pro1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D354_Biop_Pro1.to_df())
ribo_genes = D354_Biop_Pro1.to_df().columns.isin(RB_genes_in_df)
D354_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D354_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D354_Biop_Pro1.X, axis=1).A1
D354_Biop_Pro1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D354_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [113]:
sc.pp.filter_cells(D354_Biop_Pro1, min_genes=500)
D354_Biop_Pro1 = D354_Biop_Pro1[D354_Biop_Pro1.obs['n_counts'] < 30000, :]
D354_Biop_Pro1 = D354_Biop_Pro1[D354_Biop_Pro1.obs['percent_mito'] < 0.15, :]
filtered out 57 cells that have less than 500 genes expressed
In [114]:
# scrublet
scrub = scr.Scrublet(D354_Biop_Pro1.X, expected_doublet_rate=0.016)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D354_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D354_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.17
Detected doublet rate = 0.7%
Estimated detectable doublet fraction = 34.1%
Overall doublet rate:
	Expected   = 1.6%
	Estimated  = 2.1%
Elapsed time: 1.2 seconds
Out[114]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb4405940>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb9cbff60>],
       dtype=object))
In [115]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D354_Biop_Pro1.X).predict()
D354_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20798563957214355 seconds
Jaccard graph constructed in 0.43631482124328613 seconds
Wrote graph to binary file in 0.03902935981750488 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.894808
Louvain completed 21 runs in 1.1114189624786377 seconds
PhenoGraph complete in 1.8260626792907715 seconds
Found communities [-1, ... 20], with sizes: [224, 369, 250, 203, 158, 151, 149, 135, 107, 85, 65, 60, 57, 54, 51, 31, 24, 22, 20, 18, 15, 13]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3108201026916504 seconds
Jaccard graph constructed in 0.4447774887084961 seconds
Wrote graph to binary file in 0.03623771667480469 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89544
After 6 runs, maximum modularity is Q = 0.896991
Louvain completed 26 runs in 1.4794540405273438 seconds
PhenoGraph complete in 2.283229351043701 seconds
Found communities [-1, ... 21], with sizes: [226, 349, 251, 201, 161, 147, 139, 124, 97, 80, 79, 61, 57, 56, 45, 42, 25, 24, 23, 23, 20, 18, 13]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3073890209197998 seconds
Jaccard graph constructed in 0.43344998359680176 seconds
Wrote graph to binary file in 0.23255109786987305 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900435
Louvain completed 21 runs in 1.0921292304992676 seconds
PhenoGraph complete in 2.0783698558807373 seconds
Found communities [-1, ... 23], with sizes: [220, 348, 253, 174, 139, 131, 115, 99, 95, 94, 92, 91, 71, 61, 56, 55, 33, 24, 23, 22, 16, 15, 12, 11, 11]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3080592155456543 seconds
Jaccard graph constructed in 0.4336819648742676 seconds
Wrote graph to binary file in 0.036360979080200195 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.897128
After 2 runs, maximum modularity is Q = 0.899447
Louvain completed 22 runs in 1.2966358661651611 seconds
PhenoGraph complete in 2.0864593982696533 seconds
Found communities [-1, ... 21], with sizes: [199, 344, 245, 175, 172, 124, 123, 117, 109, 93, 92, 75, 74, 62, 56, 51, 46, 22, 21, 18, 17, 15, 11]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30846166610717773 seconds
Jaccard graph constructed in 0.4344816207885742 seconds
Wrote graph to binary file in 0.035085201263427734 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895133
After 3 runs, maximum modularity is Q = 0.896925
Louvain completed 23 runs in 1.3573853969573975 seconds
PhenoGraph complete in 2.1456351280212402 seconds
Found communities [-1, ... 20], with sizes: [226, 352, 261, 185, 174, 135, 117, 115, 97, 92, 86, 59, 57, 53, 52, 49, 41, 31, 24, 21, 18, 16]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20606327056884766 seconds
Jaccard graph constructed in 0.4269881248474121 seconds
Wrote graph to binary file in 0.199462890625 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895028
Louvain completed 21 runs in 1.0952200889587402 seconds
PhenoGraph complete in 1.9396007061004639 seconds
Found communities [-1, ... 20], with sizes: [223, 367, 265, 175, 173, 122, 114, 112, 99, 93, 85, 61, 57, 56, 56, 46, 36, 35, 26, 23, 20, 17]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30696773529052734 seconds
Jaccard graph constructed in 0.4124562740325928 seconds
Wrote graph to binary file in 0.03569531440734863 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.892923
After 2 runs, maximum modularity is Q = 0.894562
Louvain completed 22 runs in 1.2908604145050049 seconds
PhenoGraph complete in 2.0582950115203857 seconds
Found communities [-1, ... 19], with sizes: [215, 358, 335, 256, 183, 142, 134, 125, 113, 91, 60, 57, 47, 24, 24, 21, 19, 18, 16, 12, 11]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3103320598602295 seconds
Jaccard graph constructed in 0.42313146591186523 seconds
Wrote graph to binary file in 0.03623080253601074 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.892364
After 2 runs, maximum modularity is Q = 0.894884
Louvain completed 22 runs in 1.3216302394866943 seconds
PhenoGraph complete in 2.1039392948150635 seconds
Found communities [-1, ... 20], with sizes: [203, 369, 285, 187, 177, 168, 110, 110, 109, 93, 84, 59, 57, 52, 45, 40, 27, 24, 19, 16, 14, 13]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30880260467529297 seconds
Jaccard graph constructed in 0.4333014488220215 seconds
Wrote graph to binary file in 0.034192800521850586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893551
Louvain completed 21 runs in 1.118133783340454 seconds
PhenoGraph complete in 1.9052391052246094 seconds
Found communities [-1, ... 20], with sizes: [227, 358, 328, 229, 170, 156, 97, 90, 83, 82, 78, 63, 56, 55, 35, 31, 29, 25, 23, 21, 14, 11]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20642876625061035 seconds
Jaccard graph constructed in 0.43306493759155273 seconds
Wrote graph to binary file in 0.19727802276611328 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890663
After 2 runs, maximum modularity is Q = 0.893039
Louvain completed 22 runs in 1.3608157634735107 seconds
PhenoGraph complete in 2.2119898796081543 seconds
Found communities [-1, ... 20], with sizes: [186, 356, 354, 193, 187, 161, 123, 106, 97, 71, 64, 61, 57, 56, 49, 33, 25, 24, 18, 17, 12, 11]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3070404529571533 seconds
Jaccard graph constructed in 0.419095516204834 seconds
Wrote graph to binary file in 0.03542590141296387 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893239
Louvain completed 21 runs in 1.0821685791015625 seconds
PhenoGraph complete in 1.8562963008880615 seconds
Found communities [-1, ... 18], with sizes: [244, 372, 363, 189, 184, 134, 131, 123, 122, 73, 65, 60, 57, 41, 26, 23, 17, 14, 12, 11]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3066115379333496 seconds
Jaccard graph constructed in 0.44223952293395996 seconds
Wrote graph to binary file in 0.0375819206237793 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.892366
Louvain completed 21 runs in 1.1158223152160645 seconds
PhenoGraph complete in 1.9144954681396484 seconds
Found communities [-1, ... 19], with sizes: [258, 331, 247, 244, 153, 129, 122, 100, 100, 86, 77, 77, 58, 52, 47, 41, 38, 37, 23, 22, 19]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3083064556121826 seconds
Jaccard graph constructed in 0.4503061771392822 seconds
Wrote graph to binary file in 0.19364523887634277 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895026
After 3 runs, maximum modularity is Q = 0.896758
Louvain completed 23 runs in 1.326650619506836 seconds
PhenoGraph complete in 2.2930917739868164 seconds
Found communities [-1, ... 21], with sizes: [247, 326, 258, 191, 164, 161, 126, 113, 101, 81, 79, 61, 58, 57, 53, 38, 28, 25, 25, 19, 18, 18, 14]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2068653106689453 seconds
Jaccard graph constructed in 0.42003607749938965 seconds
Wrote graph to binary file in 0.03583359718322754 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893939
Louvain completed 21 runs in 1.0809791088104248 seconds
PhenoGraph complete in 1.7563679218292236 seconds
Found communities [-1, ... 21], with sizes: [228, 347, 223, 187, 174, 164, 164, 116, 96, 96, 80, 59, 55, 49, 44, 41, 36, 25, 19, 16, 15, 14, 13]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30782556533813477 seconds
Jaccard graph constructed in 0.42464208602905273 seconds
Wrote graph to binary file in 0.036917686462402344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895056
Louvain completed 21 runs in 1.0969853401184082 seconds
PhenoGraph complete in 1.878363847732544 seconds
Found communities [-1, ... 20], with sizes: [222, 363, 328, 190, 164, 164, 136, 105, 89, 67, 58, 55, 54, 51, 46, 37, 32, 26, 22, 21, 17, 14]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20641303062438965 seconds
Jaccard graph constructed in 0.4524691104888916 seconds
Wrote graph to binary file in 0.20171022415161133 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893403
Louvain completed 21 runs in 1.10914945602417 seconds
PhenoGraph complete in 1.9848384857177734 seconds
Found communities [-1, ... 19], with sizes: [212, 351, 264, 210, 192, 145, 119, 112, 103, 98, 70, 67, 60, 56, 55, 54, 25, 20, 20, 17, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3071401119232178 seconds
Jaccard graph constructed in 0.4279038906097412 seconds
Wrote graph to binary file in 0.03468894958496094 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893619
After 2 runs, maximum modularity is Q = 0.895078
Louvain completed 22 runs in 1.3012712001800537 seconds
PhenoGraph complete in 2.0826313495635986 seconds
Found communities [-1, ... 18], with sizes: [237, 349, 252, 226, 202, 162, 149, 114, 105, 101, 86, 59, 56, 40, 28, 22, 22, 19, 17, 15]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3069031238555908 seconds
Jaccard graph constructed in 0.42010951042175293 seconds
Wrote graph to binary file in 0.03604316711425781 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89463
Louvain completed 21 runs in 1.1117844581604004 seconds
PhenoGraph complete in 1.886242151260376 seconds
Found communities [-1, ... 20], with sizes: [243, 347, 235, 196, 187, 129, 121, 111, 97, 90, 79, 76, 56, 52, 52, 49, 48, 24, 23, 19, 16, 11]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3070688247680664 seconds
Jaccard graph constructed in 0.42629456520080566 seconds
Wrote graph to binary file in 0.03327345848083496 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893728
Louvain completed 21 runs in 1.0781338214874268 seconds
PhenoGraph complete in 1.8543238639831543 seconds
Found communities [-1, ... 23], with sizes: [196, 352, 244, 240, 164, 160, 136, 114, 99, 81, 71, 56, 54, 51, 37, 27, 26, 26, 23, 23, 22, 20, 16, 12, 11]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3070995807647705 seconds
Jaccard graph constructed in 0.5920555591583252 seconds
Wrote graph to binary file in 0.03378438949584961 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.894982
After 4 runs, maximum modularity is Q = 0.896278
After 5 runs, maximum modularity is Q = 0.897416
Louvain completed 25 runs in 1.6029915809631348 seconds
PhenoGraph complete in 2.5494496822357178 seconds
Found communities [-1, ... 19], with sizes: [250, 357, 244, 182, 165, 163, 142, 123, 103, 88, 86, 62, 55, 55, 40, 36, 33, 23, 20, 18, 16]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3067197799682617 seconds
Jaccard graph constructed in 0.43877077102661133 seconds
Wrote graph to binary file in 0.0335230827331543 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895023
After 6 runs, maximum modularity is Q = 0.89621
Louvain completed 26 runs in 1.4676036834716797 seconds
PhenoGraph complete in 2.2580513954162598 seconds
Found communities [-1, ... 22], with sizes: [211, 343, 223, 190, 186, 155, 128, 121, 100, 91, 79, 78, 57, 54, 53, 46, 24, 23, 22, 22, 17, 14, 12, 12]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31211042404174805 seconds
Jaccard graph constructed in 0.44854235649108887 seconds
Wrote graph to binary file in 0.03439927101135254 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893999
After 2 runs, maximum modularity is Q = 0.895689
Louvain completed 22 runs in 1.305464506149292 seconds
PhenoGraph complete in 2.1120829582214355 seconds
Found communities [-1, ... 23], with sizes: [230, 340, 252, 192, 165, 133, 111, 109, 100, 96, 94, 69, 55, 51, 50, 44, 29, 24, 21, 21, 21, 16, 15, 12, 11]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3101367950439453 seconds
Jaccard graph constructed in 0.44313836097717285 seconds
Wrote graph to binary file in 0.19430088996887207 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891348
After 3 runs, maximum modularity is Q = 0.892816
After 4 runs, maximum modularity is Q = 0.895152
Louvain completed 24 runs in 1.5397734642028809 seconds
PhenoGraph complete in 2.5027737617492676 seconds
Found communities [-1, ... 20], with sizes: [201, 346, 270, 200, 165, 120, 116, 112, 112, 105, 98, 76, 57, 54, 53, 53, 27, 27, 24, 17, 16, 12]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30808544158935547 seconds
Jaccard graph constructed in 0.44527363777160645 seconds
Wrote graph to binary file in 0.034067392349243164 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.894172
After 4 runs, maximum modularity is Q = 0.895367
After 19 runs, maximum modularity is Q = 0.896371
Louvain completed 39 runs in 2.2144389152526855 seconds
PhenoGraph complete in 3.0123860836029053 seconds
Found communities [-1, ... 17], with sizes: [247, 361, 253, 231, 175, 169, 127, 113, 113, 106, 60, 55, 53, 51, 47, 33, 26, 25, 16]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20731735229492188 seconds
Jaccard graph constructed in 0.42447829246520996 seconds
Wrote graph to binary file in 0.03425025939941406 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.896583
Louvain completed 21 runs in 1.082287311553955 seconds
PhenoGraph complete in 1.761446475982666 seconds
Found communities [-1, ... 20], with sizes: [241, 334, 251, 192, 178, 128, 123, 105, 102, 93, 92, 58, 56, 48, 43, 43, 40, 38, 25, 24, 24, 23]

In [116]:
sc.pp.normalize_per_cell(D354_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D354_Biop_Pro1) # log transform the data
D354_Biop_Pro1.raw = D354_Biop_Pro1 # freeze the object (for later use of the raw state of it)
In [117]:
D354_Biop_Pro1 = D354_Biop_Pro1[:, D354_Biop_Pro1.var['ribo_genes']]
D354_Biop_Pro1
Out[117]:
View of AnnData object with n_obs × n_vars = 1809 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [118]:
D363_Biop_Pro1 = sc.read_10x_mtx(
    './D363_Biop_Pro1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D363_Biop_Pro1.var_names_make_unique()
D363_Biop_Pro1.obs['manip'] = 'D363_Biop_Pro1'
D363_Biop_Pro1.obs['position'] = 'Proximal'
D363_Biop_Pro1.obs['method'] = 'Biopsy'
D363_Biop_Pro1.obs['donor'] = 'D363'
D363_Biop_Pro1.obs['name'] = ['D363_Biop_Pro1_' + s for s in list(D363_Biop_Pro1.obs.index)]
D363_Biop_Pro1.obs_names = D363_Biop_Pro1.obs['name']
D363_Biop_Pro1
... reading from cache file ./cache/D363_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[118]:
AnnData object with n_obs × n_vars = 1531 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [119]:
sc.pl.highest_expr_genes(D363_Biop_Pro1, n_top=20)
In [120]:
sc.pp.filter_cells(D363_Biop_Pro1, min_genes=0)
mito_genes = D363_Biop_Pro1.var_names.str.startswith('MT-')
D363_Biop_Pro1.obs['percent_mito'] = np.sum(
    D363_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D363_Biop_Pro1.X, axis=1).A1
D363_Biop_Pro1.obs['n_counts'] = D363_Biop_Pro1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D363_Biop_Pro1.to_df())
ribo_genes = D363_Biop_Pro1.to_df().columns.isin(RB_genes_in_df)
D363_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D363_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D363_Biop_Pro1.X, axis=1).A1
D363_Biop_Pro1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D363_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [121]:
sc.pp.filter_cells(D363_Biop_Pro1, min_genes=500)
D363_Biop_Pro1 = D363_Biop_Pro1[D363_Biop_Pro1.obs['n_counts'] < 15000, :]
D363_Biop_Pro1 = D363_Biop_Pro1[D363_Biop_Pro1.obs['percent_mito'] < 0.25, :]
filtered out 20 cells that have less than 500 genes expressed
In [122]:
# scrublet
scrub = scr.Scrublet(D363_Biop_Pro1.X, expected_doublet_rate=0.011)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D363_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D363_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.12
Detected doublet rate = 0.7%
Estimated detectable doublet fraction = 17.0%
Overall doublet rate:
	Expected   = 1.1%
	Estimated  = 3.9%
Elapsed time: 0.8 seconds
Out[122]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb46fca90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb65d5f98>],
       dtype=object))
In [123]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D363_Biop_Pro1.X).predict()
D363_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30674242973327637 seconds
Jaccard graph constructed in 0.4128077030181885 seconds
Wrote graph to binary file in 0.02238774299621582 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869635
Louvain completed 21 runs in 1.0842034816741943 seconds
PhenoGraph complete in 1.836641788482666 seconds
Found communities [-1, ... 14], with sizes: [279, 496, 313, 213, 140, 76, 64, 62, 58, 41, 35, 28, 25, 19, 18, 18]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20850038528442383 seconds
Jaccard graph constructed in 0.43157458305358887 seconds
Wrote graph to binary file in 0.02245807647705078 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.871286
Louvain completed 21 runs in 1.1038267612457275 seconds
PhenoGraph complete in 1.7786390781402588 seconds
Found communities [-1, ... 15], with sizes: [205, 528, 265, 168, 148, 98, 90, 71, 63, 47, 40, 39, 28, 26, 26, 24, 19]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20830273628234863 seconds
Jaccard graph constructed in 0.4142801761627197 seconds
Wrote graph to binary file in 0.19663286209106445 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868827
Louvain completed 21 runs in 1.0925955772399902 seconds
PhenoGraph complete in 1.9218556880950928 seconds
Found communities [-1, ... 14], with sizes: [238, 509, 216, 157, 146, 138, 70, 69, 68, 64, 59, 40, 39, 30, 24, 18]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3095426559448242 seconds
Jaccard graph constructed in 0.43410587310791016 seconds
Wrote graph to binary file in 0.025043487548828125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869482
Louvain completed 21 runs in 1.117567539215088 seconds
PhenoGraph complete in 1.898071050643921 seconds
Found communities [-1, ... 13], with sizes: [251, 532, 271, 258, 97, 96, 69, 66, 63, 45, 43, 40, 21, 19, 14]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30872035026550293 seconds
Jaccard graph constructed in 0.4299960136413574 seconds
Wrote graph to binary file in 0.024828195571899414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.859811
After 3 runs, maximum modularity is Q = 0.861928
After 8 runs, maximum modularity is Q = 0.863107
Louvain completed 28 runs in 1.71533203125 seconds
PhenoGraph complete in 2.490504026412964 seconds
Found communities [-1, ... 15], with sizes: [207, 569, 246, 138, 119, 112, 97, 64, 60, 55, 50, 49, 38, 26, 24, 17, 14]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30737853050231934 seconds
Jaccard graph constructed in 0.4416193962097168 seconds
Wrote graph to binary file in 0.023202896118164062 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869901
After 13 runs, maximum modularity is Q = 0.871082
Louvain completed 33 runs in 1.7465119361877441 seconds
PhenoGraph complete in 2.5287082195281982 seconds
Found communities [-1, ... 15], with sizes: [233, 526, 264, 240, 97, 94, 88, 73, 45, 42, 38, 38, 36, 23, 18, 17, 13]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30819153785705566 seconds
Jaccard graph constructed in 0.424422025680542 seconds
Wrote graph to binary file in 0.025684356689453125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868794
Louvain completed 21 runs in 1.0887198448181152 seconds
PhenoGraph complete in 1.8569042682647705 seconds
Found communities [-1, ... 15], with sizes: [240, 506, 265, 259, 96, 94, 72, 65, 53, 53, 40, 33, 31, 30, 20, 17, 11]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3081822395324707 seconds
Jaccard graph constructed in 0.4247720241546631 seconds
Wrote graph to binary file in 0.0247344970703125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.86858
After 2 runs, maximum modularity is Q = 0.869937
After 9 runs, maximum modularity is Q = 0.870986
Louvain completed 29 runs in 1.7439320087432861 seconds
PhenoGraph complete in 2.511955976486206 seconds
Found communities [-1, ... 15], with sizes: [268, 525, 276, 211, 147, 93, 68, 57, 55, 37, 36, 29, 26, 18, 15, 12, 12]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30760717391967773 seconds
Jaccard graph constructed in 0.41756319999694824 seconds
Wrote graph to binary file in 0.1711583137512207 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.870444
Louvain completed 21 runs in 1.0591626167297363 seconds
PhenoGraph complete in 1.9658491611480713 seconds
Found communities [-1, ... 18], with sizes: [254, 467, 241, 149, 140, 76, 71, 69, 67, 63, 48, 42, 38, 33, 28, 25, 22, 20, 16, 16]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3072676658630371 seconds
Jaccard graph constructed in 0.4320073127746582 seconds
Wrote graph to binary file in 0.0248870849609375 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87192
Louvain completed 21 runs in 1.073777675628662 seconds
PhenoGraph complete in 1.850381851196289 seconds
Found communities [-1, ... 16], with sizes: [257, 561, 227, 194, 98, 95, 70, 67, 64, 47, 43, 35, 29, 23, 22, 19, 19, 15]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3071107864379883 seconds
Jaccard graph constructed in 0.4304664134979248 seconds
Wrote graph to binary file in 0.023406982421875 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.867366
After 5 runs, maximum modularity is Q = 0.868573
Louvain completed 25 runs in 1.4344425201416016 seconds
PhenoGraph complete in 2.2068140506744385 seconds
Found communities [-1, ... 13], with sizes: [240, 509, 301, 187, 139, 102, 92, 67, 63, 46, 39, 31, 25, 24, 20]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30753302574157715 seconds
Jaccard graph constructed in 0.4122314453125 seconds
Wrote graph to binary file in 0.024931669235229492 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.875892
Louvain completed 21 runs in 1.0930914878845215 seconds
PhenoGraph complete in 1.8503565788269043 seconds
Found communities [-1, ... 15], with sizes: [286, 535, 313, 151, 114, 69, 67, 63, 57, 55, 40, 36, 34, 24, 15, 13, 13]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30872368812561035 seconds
Jaccard graph constructed in 0.4181251525878906 seconds
Wrote graph to binary file in 0.023557186126708984 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868154
Louvain completed 21 runs in 1.0970056056976318 seconds
PhenoGraph complete in 1.8579034805297852 seconds
Found communities [-1, ... 14], with sizes: [244, 504, 245, 183, 138, 118, 92, 82, 65, 55, 37, 33, 30, 24, 24, 11]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30840229988098145 seconds
Jaccard graph constructed in 0.40717005729675293 seconds
Wrote graph to binary file in 0.02349376678466797 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869651
Louvain completed 21 runs in 1.0891599655151367 seconds
PhenoGraph complete in 1.8400168418884277 seconds
Found communities [-1, ... 14], with sizes: [225, 490, 241, 225, 136, 77, 75, 67, 66, 63, 61, 60, 39, 25, 21, 14]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3091762065887451 seconds
Jaccard graph constructed in 0.4022819995880127 seconds
Wrote graph to binary file in 0.19213199615478516 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87109
Louvain completed 21 runs in 1.099332571029663 seconds
PhenoGraph complete in 2.011976480484009 seconds
Found communities [-1, ... 15], with sizes: [228, 574, 293, 135, 95, 83, 77, 71, 68, 55, 42, 37, 37, 32, 24, 20, 14]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30831241607666016 seconds
Jaccard graph constructed in 0.4155745506286621 seconds
Wrote graph to binary file in 0.024890422821044922 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.865493
After 3 runs, maximum modularity is Q = 0.866636
Louvain completed 23 runs in 1.3287489414215088 seconds
PhenoGraph complete in 2.0908920764923096 seconds
Found communities [-1, ... 16], with sizes: [241, 491, 255, 225, 109, 100, 81, 68, 64, 51, 40, 33, 32, 27, 24, 17, 16, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.307614803314209 seconds
Jaccard graph constructed in 0.45827722549438477 seconds
Wrote graph to binary file in 0.024315595626831055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.875204
Louvain completed 21 runs in 1.076829433441162 seconds
PhenoGraph complete in 1.8799591064453125 seconds
Found communities [-1, ... 15], with sizes: [253, 530, 238, 154, 107, 94, 75, 70, 66, 60, 50, 40, 34, 33, 29, 29, 23]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3083229064941406 seconds
Jaccard graph constructed in 0.37384557723999023 seconds
Wrote graph to binary file in 0.027457475662231445 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876345
Louvain completed 21 runs in 1.0791652202606201 seconds
PhenoGraph complete in 1.802321195602417 seconds
Found communities [-1, ... 15], with sizes: [241, 485, 286, 273, 110, 79, 76, 68, 58, 42, 37, 30, 30, 24, 18, 17, 11]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3073537349700928 seconds
Jaccard graph constructed in 0.3943479061126709 seconds
Wrote graph to binary file in 0.023504018783569336 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872962
Louvain completed 21 runs in 1.0706963539123535 seconds
PhenoGraph complete in 1.8065199851989746 seconds
Found communities [-1, ... 14], with sizes: [277, 526, 229, 134, 118, 107, 94, 79, 64, 63, 43, 37, 36, 36, 23, 19]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30828022956848145 seconds
Jaccard graph constructed in 0.41057538986206055 seconds
Wrote graph to binary file in 0.02418375015258789 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.866867
Louvain completed 21 runs in 1.0627703666687012 seconds
PhenoGraph complete in 1.817657232284546 seconds
Found communities [-1, ... 13], with sizes: [229, 545, 264, 263, 125, 97, 67, 65, 52, 51, 35, 30, 27, 24, 11]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30927324295043945 seconds
Jaccard graph constructed in 0.4102966785430908 seconds
Wrote graph to binary file in 0.16541123390197754 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876616
Louvain completed 21 runs in 1.1146433353424072 seconds
PhenoGraph complete in 2.0135934352874756 seconds
Found communities [-1, ... 14], with sizes: [246, 521, 252, 218, 134, 114, 60, 58, 53, 49, 46, 35, 32, 32, 24, 11]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3083162307739258 seconds
Jaccard graph constructed in 0.4117872714996338 seconds
Wrote graph to binary file in 0.02314305305480957 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868976
Louvain completed 21 runs in 1.114424228668213 seconds
PhenoGraph complete in 1.8750827312469482 seconds
Found communities [-1, ... 16], with sizes: [267, 491, 315, 156, 124, 94, 90, 65, 55, 37, 32, 31, 27, 26, 23, 22, 15, 15]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41069793701171875 seconds
Jaccard graph constructed in 0.4212212562561035 seconds
Wrote graph to binary file in 0.023562192916870117 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.870094
After 2 runs, maximum modularity is Q = 0.871666
Louvain completed 22 runs in 1.2846720218658447 seconds
PhenoGraph complete in 2.1531689167022705 seconds
Found communities [-1, ... 12], with sizes: [279, 507, 251, 251, 144, 76, 71, 68, 67, 57, 32, 30, 28, 24]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30826401710510254 seconds
Jaccard graph constructed in 0.40161728858947754 seconds
Wrote graph to binary file in 0.022974729537963867 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869537
Louvain completed 21 runs in 1.1055395603179932 seconds
PhenoGraph complete in 1.8490526676177979 seconds
Found communities [-1, ... 13], with sizes: [259, 527, 257, 221, 137, 97, 75, 62, 58, 57, 43, 39, 25, 17, 11]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30946779251098633 seconds
Jaccard graph constructed in 0.40761303901672363 seconds
Wrote graph to binary file in 0.02495288848876953 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.867995
After 5 runs, maximum modularity is Q = 0.869488
Louvain completed 25 runs in 1.433722972869873 seconds
PhenoGraph complete in 2.1903669834136963 seconds
Found communities [-1, ... 14], with sizes: [227, 500, 257, 168, 127, 123, 83, 71, 66, 62, 60, 41, 34, 28, 23, 15]

In [124]:
sc.pp.normalize_per_cell(D363_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D363_Biop_Pro1) # log transform the data
D363_Biop_Pro1.raw = D363_Biop_Pro1 # freeze the object (for later use of the raw state of it)
In [125]:
D363_Biop_Pro1 = D363_Biop_Pro1[:, D363_Biop_Pro1.var['ribo_genes']]
D363_Biop_Pro1
Out[125]:
View of AnnData object with n_obs × n_vars = 1508 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [126]:
D367_Biop_Pro1 = sc.read_10x_mtx(
    './D367_Biop_Pro1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D367_Biop_Pro1.var_names_make_unique()
D367_Biop_Pro1.obs['manip'] = 'D367_Biop_Pro1'
D367_Biop_Pro1.obs['position'] = 'Proximal'
D367_Biop_Pro1.obs['method'] = 'Biopsy'
D367_Biop_Pro1.obs['donor'] = 'D367'
D367_Biop_Pro1.obs['name'] = ['D367_Biop_Pro1_' + s for s in list(D367_Biop_Pro1.obs.index)]
D367_Biop_Pro1.obs_names = D367_Biop_Pro1.obs['name']
D367_Biop_Pro1
... reading from cache file ./cache/D367_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[126]:
AnnData object with n_obs × n_vars = 3180 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [127]:
sc.pl.highest_expr_genes(D367_Biop_Pro1, n_top=20)
In [128]:
sc.pp.filter_cells(D367_Biop_Pro1, min_genes=0)
mito_genes = D367_Biop_Pro1.var_names.str.startswith('MT-')
D367_Biop_Pro1.obs['percent_mito'] = np.sum(
    D367_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D367_Biop_Pro1.X, axis=1).A1
D367_Biop_Pro1.obs['n_counts'] = D367_Biop_Pro1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D367_Biop_Pro1.to_df())
ribo_genes = D367_Biop_Pro1.to_df().columns.isin(RB_genes_in_df)
D367_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D367_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D367_Biop_Pro1.X, axis=1).A1
D367_Biop_Pro1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D367_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [129]:
sc.pp.filter_cells(D367_Biop_Pro1, min_genes=500)
D367_Biop_Pro1 = D367_Biop_Pro1[D367_Biop_Pro1.obs['n_counts'] < 30000, :]
D367_Biop_Pro1 = D367_Biop_Pro1[D367_Biop_Pro1.obs['percent_mito'] < 0.4, :]
filtered out 7 cells that have less than 500 genes expressed
In [130]:
# scrublet
scrub = scr.Scrublet(D367_Biop_Pro1.X, expected_doublet_rate=0.024)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D367_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D367_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.30
Detected doublet rate = 0.6%
Estimated detectable doublet fraction = 16.0%
Overall doublet rate:
	Expected   = 2.4%
	Estimated  = 3.8%
Elapsed time: 2.4 seconds
Out[130]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb62285c0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb62ea748>],
       dtype=object))
In [131]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D367_Biop_Pro1.X).predict()
D367_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8105325698852539 seconds
Jaccard graph constructed in 0.5572328567504883 seconds
Wrote graph to binary file in 0.05981707572937012 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.920174
Louvain completed 21 runs in 1.4654748439788818 seconds
PhenoGraph complete in 2.9070277214050293 seconds
Found communities [-1, ... 21], with sizes: [194, 1216, 420, 238, 223, 212, 167, 158, 137, 115, 108, 94, 92, 85, 83, 80, 74, 53, 52, 49, 40, 35, 23]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7097203731536865 seconds
Jaccard graph constructed in 0.5629580020904541 seconds
Wrote graph to binary file in 0.25125646591186523 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.919261
After 11 runs, maximum modularity is Q = 0.920274
Louvain completed 31 runs in 2.2788219451904297 seconds
PhenoGraph complete in 3.817668914794922 seconds
Found communities [-1, ... 23], with sizes: [194, 1171, 421, 266, 232, 198, 179, 154, 150, 116, 109, 92, 90, 85, 85, 83, 54, 52, 47, 41, 40, 37, 23, 16, 13]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9105391502380371 seconds
Jaccard graph constructed in 0.5836596488952637 seconds
Wrote graph to binary file in 0.06721949577331543 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.923622
Louvain completed 21 runs in 1.4665532112121582 seconds
PhenoGraph complete in 3.043471574783325 seconds
Found communities [-1, ... 23], with sizes: [195, 1131, 438, 244, 233, 228, 184, 173, 141, 123, 121, 106, 91, 79, 71, 57, 56, 55, 49, 45, 43, 37, 23, 14, 11]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8099977970123291 seconds
Jaccard graph constructed in 0.5825245380401611 seconds
Wrote graph to binary file in 0.23905181884765625 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.92033
Louvain completed 21 runs in 1.4683146476745605 seconds
PhenoGraph complete in 3.1150622367858887 seconds
Found communities [-1, ... 22], with sizes: [187, 1154, 431, 348, 237, 224, 172, 168, 140, 112, 105, 94, 86, 76, 71, 59, 54, 50, 49, 43, 36, 24, 15, 13]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8167722225189209 seconds
Jaccard graph constructed in 0.5718975067138672 seconds
Wrote graph to binary file in 0.06685614585876465 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.920141
Louvain completed 21 runs in 1.4451842308044434 seconds
PhenoGraph complete in 2.9165916442871094 seconds
Found communities [-1, ... 22], with sizes: [175, 1193, 421, 264, 242, 221, 175, 145, 135, 109, 107, 105, 92, 89, 81, 72, 62, 56, 51, 46, 35, 29, 24, 19]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8103835582733154 seconds
Jaccard graph constructed in 0.5667843818664551 seconds
Wrote graph to binary file in 0.21831989288330078 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.92355
Louvain completed 21 runs in 1.455714464187622 seconds
PhenoGraph complete in 3.066343069076538 seconds
Found communities [-1, ... 22], with sizes: [196, 1199, 444, 239, 228, 217, 166, 148, 127, 124, 108, 97, 87, 82, 80, 74, 71, 54, 52, 45, 45, 29, 24, 12]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8093373775482178 seconds
Jaccard graph constructed in 0.5747344493865967 seconds
Wrote graph to binary file in 0.06644582748413086 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.922434
Louvain completed 21 runs in 1.509354829788208 seconds
PhenoGraph complete in 2.972931146621704 seconds
Found communities [-1, ... 22], with sizes: [189, 1165, 408, 323, 244, 170, 166, 155, 152, 121, 105, 91, 91, 88, 84, 83, 60, 55, 43, 41, 41, 35, 24, 14]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.909996747970581 seconds
Jaccard graph constructed in 0.6303787231445312 seconds
Wrote graph to binary file in 0.23954415321350098 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.921651
Louvain completed 21 runs in 1.4474704265594482 seconds
PhenoGraph complete in 3.2454562187194824 seconds
Found communities [-1, ... 21], with sizes: [184, 1208, 414, 315, 251, 217, 167, 165, 152, 114, 107, 91, 88, 85, 73, 60, 56, 48, 46, 40, 30, 24, 13]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.810338020324707 seconds
Jaccard graph constructed in 0.5772206783294678 seconds
Wrote graph to binary file in 0.06661152839660645 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.917956
After 20 runs, maximum modularity is Q = 0.918981
Louvain completed 40 runs in 2.744997024536133 seconds
PhenoGraph complete in 4.215928077697754 seconds
Found communities [-1, ... 21], with sizes: [200, 1203, 420, 267, 234, 219, 170, 167, 137, 114, 105, 92, 86, 82, 73, 64, 59, 58, 51, 48, 40, 36, 23]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9109387397766113 seconds
Jaccard graph constructed in 0.5822474956512451 seconds
Wrote graph to binary file in 0.24210119247436523 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.918845
Louvain completed 21 runs in 1.4738397598266602 seconds
PhenoGraph complete in 3.226494550704956 seconds
Found communities [-1, ... 21], with sizes: [193, 1237, 419, 243, 226, 223, 193, 172, 161, 136, 107, 93, 84, 69, 67, 60, 59, 51, 51, 37, 30, 23, 14]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8096990585327148 seconds
Jaccard graph constructed in 0.5590062141418457 seconds
Wrote graph to binary file in 0.06928634643554688 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.922123
Louvain completed 21 runs in 1.453864336013794 seconds
PhenoGraph complete in 2.907410144805908 seconds
Found communities [-1, ... 21], with sizes: [188, 1156, 470, 432, 330, 171, 141, 135, 123, 108, 90, 87, 86, 74, 67, 67, 46, 45, 44, 29, 22, 22, 15]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9095442295074463 seconds
Jaccard graph constructed in 0.5906589031219482 seconds
Wrote graph to binary file in 0.23798322677612305 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.920328
Louvain completed 21 runs in 1.4951162338256836 seconds
PhenoGraph complete in 3.248774290084839 seconds
Found communities [-1, ... 21], with sizes: [185, 1191, 453, 341, 236, 234, 143, 143, 141, 118, 107, 91, 88, 78, 73, 54, 51, 47, 46, 42, 35, 34, 17]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8095653057098389 seconds
Jaccard graph constructed in 0.5797111988067627 seconds
Wrote graph to binary file in 0.06597495079040527 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.921981
Louvain completed 21 runs in 1.5110702514648438 seconds
PhenoGraph complete in 2.9795186519622803 seconds
Found communities [-1, ... 22], with sizes: [140, 1230, 444, 306, 226, 223, 165, 158, 147, 126, 111, 93, 90, 81, 79, 62, 53, 45, 42, 39, 35, 23, 17, 13]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.809781551361084 seconds
Jaccard graph constructed in 0.7734463214874268 seconds
Wrote graph to binary file in 0.065277099609375 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.921835
Louvain completed 21 runs in 1.4789063930511475 seconds
PhenoGraph complete in 3.143568754196167 seconds
Found communities [-1, ... 23], with sizes: [192, 1196, 402, 332, 248, 214, 162, 144, 141, 133, 106, 93, 90, 81, 72, 56, 49, 47, 45, 39, 36, 22, 20, 17, 11]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8100326061248779 seconds
Jaccard graph constructed in 0.5675520896911621 seconds
Wrote graph to binary file in 0.21927332878112793 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.917905
After 7 runs, maximum modularity is Q = 0.91893
Louvain completed 27 runs in 2.031409978866577 seconds
PhenoGraph complete in 3.6419076919555664 seconds
Found communities [-1, ... 22], with sizes: [204, 1191, 412, 336, 214, 203, 197, 170, 127, 108, 106, 92, 86, 83, 79, 52, 51, 44, 43, 43, 37, 35, 24, 11]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8096253871917725 seconds
Jaccard graph constructed in 0.5573554039001465 seconds
Wrote graph to binary file in 0.06541252136230469 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.92422
Louvain completed 21 runs in 1.4452247619628906 seconds
PhenoGraph complete in 2.8921914100646973 seconds
Found communities [-1, ... 23], with sizes: [193, 1171, 440, 334, 248, 220, 154, 151, 143, 113, 105, 93, 91, 76, 74, 62, 58, 51, 41, 34, 32, 23, 18, 12, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8086884021759033 seconds
Jaccard graph constructed in 0.5715727806091309 seconds
Wrote graph to binary file in 0.06446361541748047 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.922189
Louvain completed 21 runs in 1.492135763168335 seconds
PhenoGraph complete in 2.94960618019104 seconds
Found communities [-1, ... 24], with sizes: [180, 1212, 435, 296, 240, 174, 150, 143, 131, 121, 106, 93, 87, 85, 84, 82, 62, 52, 45, 42, 40, 28, 23, 13, 12, 12]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8086469173431396 seconds
Jaccard graph constructed in 0.7175347805023193 seconds
Wrote graph to binary file in 0.0649256706237793 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.924324
After 4 runs, maximum modularity is Q = 0.925447
Louvain completed 24 runs in 1.8387408256530762 seconds
PhenoGraph complete in 3.444546937942505 seconds
Found communities [-1, ... 22], with sizes: [160, 1243, 446, 294, 234, 227, 162, 155, 131, 117, 108, 91, 89, 80, 80, 61, 57, 54, 41, 34, 29, 23, 19, 13]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8097279071807861 seconds
Jaccard graph constructed in 0.5598583221435547 seconds
Wrote graph to binary file in 0.22486257553100586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.918037
Louvain completed 21 runs in 1.5992670059204102 seconds
PhenoGraph complete in 3.2090132236480713 seconds
Found communities [-1, ... 22], with sizes: [185, 1207, 432, 257, 257, 231, 148, 137, 137, 127, 107, 93, 89, 81, 78, 67, 67, 54, 47, 41, 39, 34, 22, 11]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.810072660446167 seconds
Jaccard graph constructed in 0.5778443813323975 seconds
Wrote graph to binary file in 0.06517863273620605 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.922872
Louvain completed 21 runs in 1.4541747570037842 seconds
PhenoGraph complete in 2.9231276512145996 seconds
Found communities [-1, ... 25], with sizes: [178, 1192, 426, 234, 223, 199, 156, 155, 146, 118, 108, 105, 93, 85, 84, 72, 70, 60, 47, 44, 39, 28, 23, 22, 17, 13, 11]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9112532138824463 seconds
Jaccard graph constructed in 0.5815844535827637 seconds
Wrote graph to binary file in 0.06464576721191406 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.920535
After 5 runs, maximum modularity is Q = 0.921564
Louvain completed 25 runs in 1.9627289772033691 seconds
PhenoGraph complete in 3.5345304012298584 seconds
Found communities [-1, ... 23], with sizes: [179, 1192, 430, 255, 246, 241, 164, 143, 142, 124, 107, 91, 84, 77, 72, 72, 56, 53, 46, 43, 41, 35, 23, 17, 15]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8093485832214355 seconds
Jaccard graph constructed in 0.7571942806243896 seconds
Wrote graph to binary file in 0.06494641304016113 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.918662
Louvain completed 21 runs in 1.488870620727539 seconds
PhenoGraph complete in 3.135673761367798 seconds
Found communities [-1, ... 21], with sizes: [220, 1199, 429, 308, 252, 222, 153, 148, 136, 121, 107, 92, 83, 81, 79, 70, 58, 40, 40, 38, 38, 23, 11]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9098646640777588 seconds
Jaccard graph constructed in 0.5914657115936279 seconds
Wrote graph to binary file in 0.22304725646972656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.922815
After 4 runs, maximum modularity is Q = 0.923905
Louvain completed 24 runs in 1.8239877223968506 seconds
PhenoGraph complete in 3.5645148754119873 seconds
Found communities [-1, ... 22], with sizes: [195, 1211, 411, 249, 227, 225, 173, 153, 141, 120, 105, 93, 92, 79, 79, 73, 69, 49, 45, 43, 40, 39, 23, 14]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9108030796051025 seconds
Jaccard graph constructed in 0.5746753215789795 seconds
Wrote graph to binary file in 0.06561899185180664 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.923058
Louvain completed 21 runs in 1.4927396774291992 seconds
PhenoGraph complete in 3.0595078468322754 seconds
Found communities [-1, ... 22], with sizes: [203, 1185, 453, 304, 278, 166, 162, 146, 143, 131, 106, 90, 81, 73, 68, 67, 64, 56, 46, 40, 34, 23, 18, 11]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9106283187866211 seconds
Jaccard graph constructed in 0.5874824523925781 seconds
Wrote graph to binary file in 0.06527256965637207 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.922426
Louvain completed 21 runs in 1.42024564743042 seconds
PhenoGraph complete in 2.999218702316284 seconds
Found communities [-1, ... 22], with sizes: [217, 1200, 409, 256, 225, 215, 168, 161, 136, 116, 111, 92, 91, 88, 76, 74, 54, 50, 50, 47, 40, 38, 23, 11]

In [132]:
sc.pp.normalize_per_cell(D367_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D367_Biop_Pro1) # log transform the data
D367_Biop_Pro1.raw = D367_Biop_Pro1 # freeze the object (for later use of the raw state of it)
In [133]:
D367_Biop_Pro1 = D367_Biop_Pro1[:, D367_Biop_Pro1.var['ribo_genes']]
D367_Biop_Pro1
Out[133]:
View of AnnData object with n_obs × n_vars = 3159 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [134]:
D372_Biop_Pro1 = sc.read_10x_mtx(
    './D372_Biop_Pro1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D372_Biop_Pro1.var_names_make_unique()
D372_Biop_Pro1.obs['manip'] = 'D372_Biop_Pro1'
D372_Biop_Pro1.obs['position'] = 'Proximal'
D372_Biop_Pro1.obs['method'] = 'Biopsy'
D372_Biop_Pro1.obs['donor'] = 'D372'
D372_Biop_Pro1.obs['name'] = ['D372_Biop_Pro1_' + s for s in list(D372_Biop_Pro1.obs.index)]
D372_Biop_Pro1.obs_names = D372_Biop_Pro1.obs['name']
D372_Biop_Pro1
... reading from cache file ./cache/D372_Biop_Pro1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[134]:
AnnData object with n_obs × n_vars = 4585 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [135]:
sc.pl.highest_expr_genes(D372_Biop_Pro1, n_top=20)
In [136]:
sc.pp.filter_cells(D372_Biop_Pro1, min_genes=0)
mito_genes = D372_Biop_Pro1.var_names.str.startswith('MT-')
D372_Biop_Pro1.obs['percent_mito'] = np.sum(
    D372_Biop_Pro1[:, mito_genes].X, axis=1).A1 / np.sum(D372_Biop_Pro1.X, axis=1).A1
D372_Biop_Pro1.obs['n_counts'] = D372_Biop_Pro1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Biop_Pro1.to_df())
ribo_genes = D372_Biop_Pro1.to_df().columns.isin(RB_genes_in_df)
D372_Biop_Pro1.obs['percent_ribo'] = np.sum(
    D372_Biop_Pro1[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Biop_Pro1.X, axis=1).A1
D372_Biop_Pro1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D372_Biop_Pro1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [137]:
sc.pp.filter_cells(D372_Biop_Pro1, min_genes=500)
D372_Biop_Pro1 = D372_Biop_Pro1[D372_Biop_Pro1.obs['n_counts'] < 30000, :]
D372_Biop_Pro1 = D372_Biop_Pro1[D372_Biop_Pro1.obs['percent_mito'] < 0.3, :]
filtered out 4 cells that have less than 500 genes expressed
In [138]:
# scrublet
scrub = scr.Scrublet(D372_Biop_Pro1.X, expected_doublet_rate=0.038)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D372_Biop_Pro1.obs['doublet_scores'] = doublet_scores
D372_Biop_Pro1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/scrublet/helper_functions.py:238: RuntimeWarning: invalid value encountered in log
  gLog = lambda input: np.log(input[1] * np.exp(-input[0]) + input[2])
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.45
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 7.3%
Overall doublet rate:
	Expected   = 3.8%
	Estimated  = 4.2%
Elapsed time: 4.0 seconds
Out[138]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb663b198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb46e2ac8>],
       dtype=object))
In [139]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D372_Biop_Pro1.X).predict()
D372_Biop_Pro1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9112594127655029 seconds
Jaccard graph constructed in 0.7462267875671387 seconds
Wrote graph to binary file in 0.28792905807495117 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.928885
Louvain completed 21 runs in 1.7388286590576172 seconds
PhenoGraph complete in 3.702601909637451 seconds
Found communities [-1, ... 23], with sizes: [238, 1608, 936, 499, 469, 425, 221, 191, 121, 120, 109, 105, 100, 92, 78, 66, 64, 54, 50, 41, 39, 34, 25, 21, 11]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6132426261901855 seconds
Jaccard graph constructed in 0.7779467105865479 seconds
Wrote graph to binary file in 0.28315186500549316 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.930108
Louvain completed 21 runs in 1.9567527770996094 seconds
PhenoGraph complete in 4.653115749359131 seconds
Found communities [-1, ... 22], with sizes: [233, 1628, 953, 491, 481, 459, 216, 167, 126, 123, 104, 102, 93, 85, 82, 69, 64, 50, 48, 43, 34, 31, 21, 14]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.1129202842712402 seconds
Jaccard graph constructed in 0.7580430507659912 seconds
Wrote graph to binary file in 0.09833741188049316 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.924756
Louvain completed 21 runs in 1.9004945755004883 seconds
PhenoGraph complete in 3.8885812759399414 seconds
Found communities [-1, ... 24], with sizes: [240, 1644, 921, 509, 436, 430, 223, 196, 127, 125, 110, 102, 95, 90, 89, 65, 64, 50, 41, 31, 30, 27, 22, 22, 17, 11]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.7137095928192139 seconds
Jaccard graph constructed in 1.0560684204101562 seconds
Wrote graph to binary file in 0.09852480888366699 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.929801
Louvain completed 21 runs in 1.9685957431793213 seconds
PhenoGraph complete in 4.861473083496094 seconds
Found communities [-1, ... 25], with sizes: [192, 1539, 841, 543, 462, 411, 206, 206, 161, 138, 131, 106, 103, 97, 94, 69, 68, 65, 59, 42, 35, 33, 32, 29, 23, 20, 12]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6131210327148438 seconds
Jaccard graph constructed in 0.7775664329528809 seconds
Wrote graph to binary file in 0.28657078742980957 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.927946
Louvain completed 21 runs in 1.811650276184082 seconds
PhenoGraph complete in 4.507956266403198 seconds
Found communities [-1, ... 23], with sizes: [218, 1669, 832, 537, 443, 438, 232, 167, 148, 106, 106, 101, 98, 84, 80, 75, 67, 66, 50, 47, 41, 34, 33, 32, 13]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6162004470825195 seconds
Jaccard graph constructed in 0.7626640796661377 seconds
Wrote graph to binary file in 0.30016422271728516 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.927583
Louvain completed 21 runs in 1.8646981716156006 seconds
PhenoGraph complete in 4.562420606613159 seconds
Found communities [-1, ... 22], with sizes: [225, 1552, 959, 539, 503, 408, 230, 169, 132, 110, 109, 104, 103, 102, 94, 65, 64, 53, 48, 41, 36, 35, 22, 14]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.816399097442627 seconds
Jaccard graph constructed in 0.7958872318267822 seconds
Wrote graph to binary file in 0.09909796714782715 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.927359
Louvain completed 21 runs in 1.8575026988983154 seconds
PhenoGraph complete in 4.594034671783447 seconds
Found communities [-1, ... 23], with sizes: [218, 1611, 970, 495, 469, 411, 222, 209, 124, 111, 106, 104, 97, 97, 89, 65, 63, 57, 44, 41, 35, 24, 22, 20, 13]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.815995216369629 seconds
Jaccard graph constructed in 0.762723445892334 seconds
Wrote graph to binary file in 0.2827150821685791 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.930099
Louvain completed 21 runs in 1.9286956787109375 seconds
PhenoGraph complete in 4.808043003082275 seconds
Found communities [-1, ... 24], with sizes: [226, 1632, 903, 521, 460, 444, 214, 159, 122, 120, 114, 106, 102, 94, 88, 71, 65, 62, 41, 38, 33, 24, 24, 22, 21, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.5132901668548584 seconds
Jaccard graph constructed in 0.7941238880157471 seconds
Wrote graph to binary file in 0.2744612693786621 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.926996
Louvain completed 21 runs in 1.96500825881958 seconds
PhenoGraph complete in 4.569213628768921 seconds
Found communities [-1, ... 25], with sizes: [241, 1566, 864, 541, 467, 418, 237, 198, 131, 116, 109, 106, 100, 86, 78, 66, 64, 63, 50, 42, 37, 31, 28, 23, 23, 20, 12]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.511993646621704 seconds
Jaccard graph constructed in 0.7374269962310791 seconds
Wrote graph to binary file in 0.09723782539367676 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.9266
Louvain completed 21 runs in 1.897585391998291 seconds
PhenoGraph complete in 4.262134790420532 seconds
Found communities [-1, ... 23], with sizes: [201, 1660, 897, 498, 495, 436, 221, 179, 127, 113, 108, 107, 104, 103, 79, 69, 64, 50, 50, 42, 37, 31, 21, 13, 12]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6146581172943115 seconds
Jaccard graph constructed in 0.756951093673706 seconds
Wrote graph to binary file in 0.0980679988861084 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.929866
Louvain completed 21 runs in 1.9732599258422852 seconds
PhenoGraph complete in 4.465601205825806 seconds
Found communities [-1, ... 26], with sizes: [222, 1672, 780, 491, 468, 419, 220, 152, 128, 118, 118, 104, 102, 98, 90, 76, 71, 67, 59, 54, 43, 32, 31, 31, 23, 20, 17, 11]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6129565238952637 seconds
Jaccard graph constructed in 0.747774600982666 seconds
Wrote graph to binary file in 0.2811613082885742 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.928524
Louvain completed 21 runs in 1.9010710716247559 seconds
PhenoGraph complete in 4.561993837356567 seconds
Found communities [-1, ... 25], with sizes: [197, 1595, 871, 588, 467, 413, 219, 142, 131, 128, 122, 105, 104, 93, 85, 67, 67, 53, 49, 42, 38, 36, 30, 23, 22, 17, 13]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6130390167236328 seconds
Jaccard graph constructed in 0.7356076240539551 seconds
Wrote graph to binary file in 0.2615077495574951 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.928517
Louvain completed 21 runs in 1.8789329528808594 seconds
PhenoGraph complete in 4.506845712661743 seconds
Found communities [-1, ... 22], with sizes: [236, 1633, 883, 538, 464, 426, 221, 193, 129, 114, 110, 105, 100, 87, 78, 72, 65, 56, 50, 41, 37, 33, 26, 20]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6142187118530273 seconds
Jaccard graph constructed in 0.7434346675872803 seconds
Wrote graph to binary file in 0.09998083114624023 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.928991
Louvain completed 21 runs in 1.9372618198394775 seconds
PhenoGraph complete in 4.413788080215454 seconds
Found communities [-1, ... 23], with sizes: [190, 1597, 912, 539, 509, 405, 214, 186, 131, 122, 113, 104, 102, 94, 88, 71, 67, 53, 47, 43, 40, 31, 29, 17, 13]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.1131341457366943 seconds
Jaccard graph constructed in 1.018934726715088 seconds
Wrote graph to binary file in 0.11078023910522461 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.931325
Louvain completed 21 runs in 2.0481789112091064 seconds
PhenoGraph complete in 4.313236474990845 seconds
Found communities [-1, ... 23], with sizes: [213, 1547, 936, 533, 454, 445, 222, 217, 127, 119, 109, 107, 103, 95, 84, 65, 65, 58, 41, 39, 35, 34, 29, 23, 17]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.1158053874969482 seconds
Jaccard graph constructed in 0.7455377578735352 seconds
Wrote graph to binary file in 0.2893695831298828 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.927404
Louvain completed 21 runs in 1.8686316013336182 seconds
PhenoGraph complete in 4.04028582572937 seconds
Found communities [-1, ... 24], with sizes: [217, 1606, 912, 539, 506, 415, 206, 195, 112, 112, 108, 104, 97, 90, 85, 68, 65, 56, 43, 35, 33, 32, 31, 23, 16, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.619184970855713 seconds
Jaccard graph constructed in 0.7403817176818848 seconds
Wrote graph to binary file in 0.2953202724456787 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.927235
Louvain completed 21 runs in 1.9175662994384766 seconds
PhenoGraph complete in 4.594003200531006 seconds
Found communities [-1, ... 24], with sizes: [236, 1644, 928, 492, 477, 431, 219, 155, 123, 112, 104, 104, 100, 100, 88, 75, 68, 47, 40, 40, 34, 27, 23, 22, 17, 11]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6144399642944336 seconds
Jaccard graph constructed in 0.7484476566314697 seconds
Wrote graph to binary file in 0.09547019004821777 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.928154
After 8 runs, maximum modularity is Q = 0.929179
Louvain completed 28 runs in 2.6022768020629883 seconds
PhenoGraph complete in 5.083362340927124 seconds
Found communities [-1, ... 24], with sizes: [215, 1648, 809, 514, 457, 443, 250, 160, 131, 129, 110, 104, 92, 90, 84, 76, 66, 65, 54, 46, 42, 37, 32, 26, 22, 15]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.5127384662628174 seconds
Jaccard graph constructed in 0.7312557697296143 seconds
Wrote graph to binary file in 0.27966833114624023 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.930157
Louvain completed 21 runs in 1.9421899318695068 seconds
PhenoGraph complete in 4.4853222370147705 seconds
Found communities [-1, ... 24], with sizes: [233, 1604, 842, 501, 479, 411, 217, 212, 127, 117, 111, 111, 103, 98, 96, 74, 67, 61, 52, 42, 37, 33, 25, 23, 22, 19]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6139726638793945 seconds
Jaccard graph constructed in 0.7577183246612549 seconds
Wrote graph to binary file in 0.2873423099517822 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.927316
Louvain completed 21 runs in 1.8524832725524902 seconds
PhenoGraph complete in 4.533081531524658 seconds
Found communities [-1, ... 23], with sizes: [211, 1728, 812, 493, 470, 429, 234, 162, 140, 131, 113, 107, 105, 100, 82, 66, 66, 58, 49, 41, 34, 32, 23, 19, 12]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6131353378295898 seconds
Jaccard graph constructed in 0.73968505859375 seconds
Wrote graph to binary file in 0.09642791748046875 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.926533
Louvain completed 21 runs in 1.874847412109375 seconds
PhenoGraph complete in 4.343479633331299 seconds
Found communities [-1, ... 24], with sizes: [237, 1611, 901, 518, 458, 435, 214, 200, 133, 113, 108, 104, 102, 93, 87, 67, 67, 56, 42, 41, 31, 29, 22, 21, 15, 12]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.614177942276001 seconds
Jaccard graph constructed in 0.7559611797332764 seconds
Wrote graph to binary file in 0.2796041965484619 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.926431
Louvain completed 21 runs in 1.8801405429840088 seconds
PhenoGraph complete in 4.560997247695923 seconds
Found communities [-1, ... 24], with sizes: [202, 1700, 824, 503, 462, 426, 218, 188, 122, 120, 108, 107, 102, 90, 84, 77, 67, 64, 51, 44, 42, 38, 32, 19, 16, 11]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.5149061679840088 seconds
Jaccard graph constructed in 0.7289752960205078 seconds
Wrote graph to binary file in 0.0992286205291748 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.930077
Louvain completed 21 runs in 1.957927942276001 seconds
PhenoGraph complete in 4.319796562194824 seconds
Found communities [-1, ... 25], with sizes: [204, 1611, 781, 530, 490, 430, 228, 185, 171, 120, 112, 110, 105, 104, 97, 67, 65, 49, 42, 42, 34, 34, 31, 25, 22, 15, 13]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.6140358448028564 seconds
Jaccard graph constructed in 0.7665765285491943 seconds
Wrote graph to binary file in 0.281707763671875 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.927381
Louvain completed 21 runs in 1.8730874061584473 seconds
PhenoGraph complete in 4.557349443435669 seconds
Found communities [-1, ... 24], with sizes: [223, 1607, 881, 503, 460, 425, 222, 217, 155, 132, 103, 101, 95, 93, 81, 70, 65, 53, 51, 43, 34, 32, 23, 19, 16, 13]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.614048719406128 seconds
Jaccard graph constructed in 0.7313416004180908 seconds
Wrote graph to binary file in 0.28035402297973633 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.927283
Louvain completed 21 runs in 1.9604082107543945 seconds
PhenoGraph complete in 4.606944561004639 seconds
Found communities [-1, ... 23], with sizes: [199, 1592, 967, 497, 480, 457, 226, 168, 122, 108, 107, 105, 104, 103, 99, 65, 65, 50, 42, 39, 32, 27, 23, 22, 18]

In [140]:
sc.pp.normalize_per_cell(D372_Biop_Pro1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D372_Biop_Pro1) # log transform the data
D372_Biop_Pro1.raw = D372_Biop_Pro1 # freeze the object (for later use of the raw state of it)
In [141]:
D372_Biop_Pro1 = D372_Biop_Pro1[:, D372_Biop_Pro1.var['ribo_genes']]
D372_Biop_Pro1
Out[141]:
View of AnnData object with n_obs × n_vars = 4574 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

Intermediate Biopsies

Back to top

In [142]:
D322_Biop_Int1 = sc.read_10x_mtx(
    './D322_Biop_Int1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D322_Biop_Int1.var_names_make_unique()
D322_Biop_Int1.obs['manip'] = 'D322_Biop_Int1'
D322_Biop_Int1.obs['position'] = 'Intermediate'
D322_Biop_Int1.obs['method'] = 'Biopsy'
D322_Biop_Int1.obs['donor'] = 'D322'
D322_Biop_Int1.obs['name'] = ['D322_Biop_Int1_' + s for s in list(D322_Biop_Int1.obs.index)]
D322_Biop_Int1.obs_names = D322_Biop_Int1.obs['name']
D322_Biop_Int1
... reading from cache file ./cache/D322_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[142]:
AnnData object with n_obs × n_vars = 1923 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [143]:
sc.pl.highest_expr_genes(D322_Biop_Int1, n_top=20)
In [144]:
sc.pp.filter_cells(D322_Biop_Int1, min_genes=0)
mito_genes = D322_Biop_Int1.var_names.str.startswith('MT-')
D322_Biop_Int1.obs['percent_mito'] = np.sum(
    D322_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D322_Biop_Int1.X, axis=1).A1
D322_Biop_Int1.obs['n_counts'] = D322_Biop_Int1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D322_Biop_Int1.to_df())
ribo_genes = D322_Biop_Int1.to_df().columns.isin(RB_genes_in_df)
D322_Biop_Int1.obs['percent_ribo'] = np.sum(
    D322_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D322_Biop_Int1.X, axis=1).A1
D322_Biop_Int1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D322_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [145]:
sc.pp.filter_cells(D322_Biop_Int1, min_genes=500)
D322_Biop_Int1 = D322_Biop_Int1[D322_Biop_Int1.obs['n_counts'] < 20000, :]
D322_Biop_Int1 = D322_Biop_Int1[D322_Biop_Int1.obs['percent_mito'] < 0.2 , :]
filtered out 64 cells that have less than 500 genes expressed
In [146]:
# scrublet
scrub = scr.Scrublet(D322_Biop_Int1.X, expected_doublet_rate=0.016)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D322_Biop_Int1.obs['doublet_scores'] = doublet_scores
D322_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.18
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 16.4%
Overall doublet rate:
	Expected   = 1.6%
	Estimated  = 2.6%
Elapsed time: 0.9 seconds
Out[146]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb5251908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea99d1ac8>],
       dtype=object))
In [147]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D322_Biop_Int1.X).predict()
D322_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4068300724029541 seconds
Jaccard graph constructed in 0.4693014621734619 seconds
Wrote graph to binary file in 0.031632184982299805 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87247
After 12 runs, maximum modularity is Q = 0.873539
Louvain completed 32 runs in 1.9383304119110107 seconds
PhenoGraph complete in 2.8590123653411865 seconds
Found communities [-1, ... 16], with sizes: [201, 587, 261, 196, 170, 154, 117, 111, 88, 75, 68, 59, 55, 53, 49, 32, 23, 22]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40746402740478516 seconds
Jaccard graph constructed in 0.44730067253112793 seconds
Wrote graph to binary file in 0.2316131591796875 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868111
Louvain completed 21 runs in 1.2237942218780518 seconds
PhenoGraph complete in 2.320791244506836 seconds
Found communities [-1, ... 18], with sizes: [211, 752, 264, 146, 128, 122, 100, 93, 80, 76, 72, 50, 48, 45, 36, 29, 25, 19, 13, 12]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20693516731262207 seconds
Jaccard graph constructed in 0.4806842803955078 seconds
Wrote graph to binary file in 0.04390215873718262 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.871274
Louvain completed 21 runs in 1.2296905517578125 seconds
PhenoGraph complete in 1.9819493293762207 seconds
Found communities [-1, ... 15], with sizes: [239, 467, 265, 259, 197, 168, 104, 95, 94, 93, 88, 64, 50, 47, 39, 31, 21]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4074211120605469 seconds
Jaccard graph constructed in 0.5327847003936768 seconds
Wrote graph to binary file in 0.03722238540649414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.862273
After 6 runs, maximum modularity is Q = 0.864137
Louvain completed 26 runs in 1.7315800189971924 seconds
PhenoGraph complete in 2.7248294353485107 seconds
Found communities [-1, ... 18], with sizes: [164, 465, 272, 201, 184, 178, 115, 110, 106, 94, 87, 72, 66, 48, 45, 45, 20, 20, 17, 12]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40697407722473145 seconds
Jaccard graph constructed in 0.5319037437438965 seconds
Wrote graph to binary file in 0.03862738609313965 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869802
Louvain completed 21 runs in 1.2083230018615723 seconds
PhenoGraph complete in 2.198496103286743 seconds
Found communities [-1, ... 16], with sizes: [208, 686, 284, 225, 162, 139, 109, 86, 74, 63, 56, 53, 50, 46, 32, 20, 17, 11]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20621824264526367 seconds
Jaccard graph constructed in 0.5238604545593262 seconds
Wrote graph to binary file in 0.22135210037231445 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869009
After 17 runs, maximum modularity is Q = 0.870246
Louvain completed 37 runs in 2.1698074340820312 seconds
PhenoGraph complete in 3.132782220840454 seconds
Found communities [-1, ... 15], with sizes: [211, 716, 249, 184, 175, 110, 102, 94, 88, 73, 71, 59, 57, 45, 45, 21, 21]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4072568416595459 seconds
Jaccard graph constructed in 0.49279093742370605 seconds
Wrote graph to binary file in 0.047395944595336914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.870167
After 11 runs, maximum modularity is Q = 0.871219
Louvain completed 31 runs in 1.9473226070404053 seconds
PhenoGraph complete in 2.9103283882141113 seconds
Found communities [-1, ... 17], with sizes: [161, 706, 253, 210, 200, 126, 96, 87, 77, 72, 64, 58, 51, 48, 45, 22, 19, 14, 12]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20772147178649902 seconds
Jaccard graph constructed in 0.47537732124328613 seconds
Wrote graph to binary file in 0.05222797393798828 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868438
Louvain completed 21 runs in 1.2413804531097412 seconds
PhenoGraph complete in 1.9910881519317627 seconds
Found communities [-1, ... 14], with sizes: [217, 404, 396, 257, 203, 176, 105, 96, 91, 84, 62, 56, 52, 50, 48, 24]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40869641304016113 seconds
Jaccard graph constructed in 0.47931933403015137 seconds
Wrote graph to binary file in 0.04055356979370117 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.873188
Louvain completed 21 runs in 1.2273988723754883 seconds
PhenoGraph complete in 2.1766934394836426 seconds
Found communities [-1, ... 17], with sizes: [209, 463, 281, 224, 186, 126, 107, 106, 99, 87, 77, 62, 61, 56, 48, 46, 44, 26, 13]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4078657627105713 seconds
Jaccard graph constructed in 0.48507189750671387 seconds
Wrote graph to binary file in 0.2467958927154541 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872083
Louvain completed 21 runs in 1.2212657928466797 seconds
PhenoGraph complete in 2.375140428543091 seconds
Found communities [-1, ... 16], with sizes: [167, 492, 283, 252, 201, 167, 131, 91, 88, 81, 64, 59, 55, 54, 49, 47, 29, 11]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2082667350769043 seconds
Jaccard graph constructed in 0.5225830078125 seconds
Wrote graph to binary file in 0.036817073822021484 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.870846
Louvain completed 21 runs in 1.2443389892578125 seconds
PhenoGraph complete in 2.023336172103882 seconds
Found communities [-1, ... 18], with sizes: [176, 522, 263, 223, 168, 166, 114, 97, 93, 90, 83, 66, 57, 50, 50, 40, 23, 17, 12, 11]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40749216079711914 seconds
Jaccard graph constructed in 0.4809436798095703 seconds
Wrote graph to binary file in 0.05436515808105469 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872696
Louvain completed 21 runs in 1.2257423400878906 seconds
PhenoGraph complete in 2.1857211589813232 seconds
Found communities [-1, ... 15], with sizes: [177, 683, 258, 247, 211, 144, 92, 89, 83, 71, 50, 48, 43, 43, 41, 22, 19]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20770573616027832 seconds
Jaccard graph constructed in 0.549626350402832 seconds
Wrote graph to binary file in 0.03781294822692871 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.873802
After 3 runs, maximum modularity is Q = 0.875393
Louvain completed 23 runs in 1.5214858055114746 seconds
PhenoGraph complete in 2.3344552516937256 seconds
Found communities [-1, ... 15], with sizes: [214, 714, 266, 213, 178, 102, 97, 93, 89, 69, 61, 59, 47, 46, 38, 19, 16]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40694093704223633 seconds
Jaccard graph constructed in 0.5209157466888428 seconds
Wrote graph to binary file in 0.03594565391540527 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.871577
Louvain completed 21 runs in 1.2233171463012695 seconds
PhenoGraph complete in 2.200096607208252 seconds
Found communities [-1, ... 15], with sizes: [207, 762, 250, 182, 179, 151, 87, 85, 63, 62, 61, 53, 50, 47, 35, 24, 23]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20731258392333984 seconds
Jaccard graph constructed in 0.561589241027832 seconds
Wrote graph to binary file in 0.28632497787475586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87727
Louvain completed 21 runs in 1.36537504196167 seconds
PhenoGraph complete in 2.4366579055786133 seconds
Found communities [-1, ... 18], with sizes: [174, 537, 258, 211, 187, 178, 110, 101, 91, 90, 80, 62, 61, 45, 44, 32, 24, 13, 12, 11]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41071152687072754 seconds
Jaccard graph constructed in 0.5380644798278809 seconds
Wrote graph to binary file in 0.037424564361572266 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868076
After 6 runs, maximum modularity is Q = 0.869086
Louvain completed 26 runs in 1.6808693408966064 seconds
PhenoGraph complete in 2.6815407276153564 seconds
Found communities [-1, ... 17], with sizes: [173, 588, 265, 197, 188, 170, 110, 109, 89, 86, 71, 65, 50, 45, 44, 26, 22, 12, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4077441692352295 seconds
Jaccard graph constructed in 0.47994494438171387 seconds
Wrote graph to binary file in 0.039659738540649414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.878816
Louvain completed 21 runs in 1.2495367527008057 seconds
PhenoGraph complete in 2.202623128890991 seconds
Found communities [-1, ... 17], with sizes: [192, 518, 232, 198, 182, 172, 124, 117, 111, 83, 78, 70, 56, 54, 46, 36, 24, 17, 11]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20657849311828613 seconds
Jaccard graph constructed in 0.47501611709594727 seconds
Wrote graph to binary file in 0.03810620307922363 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868923
After 5 runs, maximum modularity is Q = 0.870273
Louvain completed 25 runs in 1.6174962520599365 seconds
PhenoGraph complete in 2.376770496368408 seconds
Found communities [-1, ... 17], with sizes: [171, 558, 277, 203, 177, 174, 172, 93, 83, 71, 58, 51, 49, 47, 45, 45, 21, 14, 12]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40731334686279297 seconds
Jaccard graph constructed in 0.47495150566101074 seconds
Wrote graph to binary file in 0.039067745208740234 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868957
Louvain completed 21 runs in 1.277984857559204 seconds
PhenoGraph complete in 2.2233362197875977 seconds
Found communities [-1, ... 16], with sizes: [192, 713, 325, 217, 116, 114, 86, 79, 70, 68, 63, 55, 47, 47, 46, 46, 22, 15]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20666027069091797 seconds
Jaccard graph constructed in 0.46600866317749023 seconds
Wrote graph to binary file in 0.27489566802978516 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.870521
After 2 runs, maximum modularity is Q = 0.871618
Louvain completed 22 runs in 1.5180156230926514 seconds
PhenoGraph complete in 2.479268789291382 seconds
Found communities [-1, ... 17], with sizes: [154, 770, 228, 198, 165, 140, 94, 86, 77, 69, 65, 59, 54, 52, 45, 23, 20, 11, 11]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4068617820739746 seconds
Jaccard graph constructed in 0.4757375717163086 seconds
Wrote graph to binary file in 0.05712580680847168 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872589
After 6 runs, maximum modularity is Q = 0.874023
Louvain completed 26 runs in 1.664569616317749 seconds
PhenoGraph complete in 2.617093086242676 seconds
Found communities [-1, ... 19], with sizes: [190, 606, 239, 235, 108, 107, 102, 96, 94, 89, 78, 75, 57, 56, 48, 46, 35, 22, 14, 12, 12]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2073688507080078 seconds
Jaccard graph constructed in 0.5114006996154785 seconds
Wrote graph to binary file in 0.0673227310180664 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868407
After 3 runs, maximum modularity is Q = 0.87025
Louvain completed 23 runs in 1.7132079601287842 seconds
PhenoGraph complete in 2.5164921283721924 seconds
Found communities [-1, ... 16], with sizes: [207, 566, 238, 203, 201, 144, 117, 109, 92, 88, 65, 62, 54, 51, 49, 43, 21, 11]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20794010162353516 seconds
Jaccard graph constructed in 0.5248119831085205 seconds
Wrote graph to binary file in 0.03822636604309082 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872533
Louvain completed 21 runs in 1.2348878383636475 seconds
PhenoGraph complete in 2.017181396484375 seconds
Found communities [-1, ... 15], with sizes: [228, 729, 339, 172, 106, 105, 99, 90, 84, 83, 63, 58, 45, 45, 43, 21, 11]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4074375629425049 seconds
Jaccard graph constructed in 0.4670424461364746 seconds
Wrote graph to binary file in 0.05050349235534668 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.873935
Louvain completed 21 runs in 1.2327196598052979 seconds
PhenoGraph complete in 2.1718688011169434 seconds
Found communities [-1, ... 15], with sizes: [207, 730, 296, 203, 170, 135, 88, 84, 82, 67, 61, 49, 45, 39, 33, 20, 12]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30725574493408203 seconds
Jaccard graph constructed in 0.46344733238220215 seconds
Wrote graph to binary file in 0.06349396705627441 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.873975
Louvain completed 21 runs in 1.2412848472595215 seconds
PhenoGraph complete in 2.0901646614074707 seconds
Found communities [-1, ... 19], with sizes: [200, 493, 258, 213, 188, 185, 136, 97, 88, 78, 62, 61, 54, 45, 43, 43, 23, 14, 14, 14, 12]

In [148]:
sc.pp.normalize_per_cell(D322_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D322_Biop_Int1) # log transform the data
D322_Biop_Int1.raw = D322_Biop_Int1 # freeze the object (for later use of the raw state of it)
In [149]:
D322_Biop_Int1 = D322_Biop_Int1[:, D322_Biop_Int1.var['ribo_genes']]
D322_Biop_Int1
Out[149]:
View of AnnData object with n_obs × n_vars = 1857 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [150]:
D326_Biop_Int1 = sc.read_10x_mtx(
    './D326_Biop_Int1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D326_Biop_Int1.var_names_make_unique()
D326_Biop_Int1.obs['manip'] = 'D326_Biop_Int1'
D326_Biop_Int1.obs['position'] = 'Intermediate'
D326_Biop_Int1.obs['method'] = 'Biopsy'
D326_Biop_Int1.obs['donor'] = 'D326'
D326_Biop_Int1.obs['name'] = ['D326_Biop_Int1_' + s for s in list(D326_Biop_Int1.obs.index)]
D326_Biop_Int1.obs_names = D326_Biop_Int1.obs['name']
D326_Biop_Int1
... reading from cache file ./cache/D326_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[150]:
AnnData object with n_obs × n_vars = 1248 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [151]:
sc.pl.highest_expr_genes(D326_Biop_Int1, n_top=20)
In [152]:
sc.pp.filter_cells(D326_Biop_Int1, min_genes=0)
mito_genes = D326_Biop_Int1.var_names.str.startswith('MT-')
D326_Biop_Int1.obs['percent_mito'] = np.sum(
    D326_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D326_Biop_Int1.X, axis=1).A1
D326_Biop_Int1.obs['n_counts'] = D326_Biop_Int1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D326_Biop_Int1.to_df())
ribo_genes = D326_Biop_Int1.to_df().columns.isin(RB_genes_in_df)
D326_Biop_Int1.obs['percent_ribo'] = np.sum(
    D326_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D326_Biop_Int1.X, axis=1).A1
D326_Biop_Int1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D326_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [153]:
sc.pp.filter_cells(D326_Biop_Int1, min_genes=500)
D326_Biop_Int1 = D326_Biop_Int1[D326_Biop_Int1.obs['n_counts'] < 25000, :]
D326_Biop_Int1 = D326_Biop_Int1[D326_Biop_Int1.obs['percent_mito'] < 0.3, :]
filtered out 11 cells that have less than 500 genes expressed
In [154]:
# scrublet
scrub = scr.Scrublet(D326_Biop_Int1.X, expected_doublet_rate=0.011)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D326_Biop_Int1.obs['doublet_scores'] = doublet_scores
D326_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.11
Detected doublet rate = 0.7%
Estimated detectable doublet fraction = 16.1%
Overall doublet rate:
	Expected   = 1.1%
	Estimated  = 4.1%
Elapsed time: 0.7 seconds
Out[154]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb019b240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb646c240>],
       dtype=object))
In [155]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D326_Biop_Int1.X).predict()
D326_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21250677108764648 seconds
Jaccard graph constructed in 0.3636820316314697 seconds
Wrote graph to binary file in 0.02783966064453125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872555
After 2 runs, maximum modularity is Q = 0.876168
Louvain completed 22 runs in 1.3342251777648926 seconds
PhenoGraph complete in 1.9531424045562744 seconds
Found communities [-1, ... 16], with sizes: [245, 265, 183, 152, 106, 72, 70, 69, 64, 60, 55, 40, 35, 31, 29, 28, 15, 13]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21496820449829102 seconds
Jaccard graph constructed in 0.37735795974731445 seconds
Wrote graph to binary file in 0.026920080184936523 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.871503
Louvain completed 21 runs in 1.1651508808135986 seconds
PhenoGraph complete in 1.794358730316162 seconds
Found communities [-1, ... 14], with sizes: [228, 280, 179, 152, 142, 84, 83, 83, 65, 65, 42, 36, 29, 29, 23, 12]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.206953763961792 seconds
Jaccard graph constructed in 0.42186832427978516 seconds
Wrote graph to binary file in 0.022740840911865234 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.875249
Louvain completed 21 runs in 1.135221242904663 seconds
PhenoGraph complete in 1.798593521118164 seconds
Found communities [-1, ... 14], with sizes: [244, 257, 201, 170, 103, 88, 82, 66, 62, 56, 44, 43, 38, 29, 26, 23]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21445083618164062 seconds
Jaccard graph constructed in 0.43778324127197266 seconds
Wrote graph to binary file in 0.02095937728881836 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.870239
After 5 runs, maximum modularity is Q = 0.871775
Louvain completed 25 runs in 1.5078670978546143 seconds
PhenoGraph complete in 2.190139055252075 seconds
Found communities [-1, ... 16], with sizes: [207, 257, 197, 133, 121, 91, 87, 70, 56, 50, 47, 46, 39, 32, 32, 29, 26, 12]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20624947547912598 seconds
Jaccard graph constructed in 0.3668174743652344 seconds
Wrote graph to binary file in 0.25129103660583496 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87048
After 2 runs, maximum modularity is Q = 0.872262
Louvain completed 22 runs in 1.370293140411377 seconds
PhenoGraph complete in 2.2099478244781494 seconds
Found communities [-1, ... 13], with sizes: [226, 240, 234, 188, 120, 81, 76, 75, 68, 62, 62, 37, 28, 20, 15]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21539044380187988 seconds
Jaccard graph constructed in 0.3713223934173584 seconds
Wrote graph to binary file in 0.03494429588317871 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.875937
Louvain completed 21 runs in 1.1226935386657715 seconds
PhenoGraph complete in 1.7589333057403564 seconds
Found communities [-1, ... 16], with sizes: [232, 260, 166, 113, 103, 87, 82, 78, 75, 60, 59, 53, 45, 37, 27, 24, 16, 15]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21239542961120605 seconds
Jaccard graph constructed in 0.4254121780395508 seconds
Wrote graph to binary file in 0.025604963302612305 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.873557
Louvain completed 21 runs in 1.123821496963501 seconds
PhenoGraph complete in 1.7998547554016113 seconds
Found communities [-1, ... 14], with sizes: [233, 290, 165, 124, 105, 94, 81, 81, 75, 69, 55, 41, 40, 28, 27, 24]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2119286060333252 seconds
Jaccard graph constructed in 0.3564941883087158 seconds
Wrote graph to binary file in 0.030817031860351562 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872046
After 2 runs, maximum modularity is Q = 0.873677
After 4 runs, maximum modularity is Q = 0.875019
Louvain completed 24 runs in 1.707763910293579 seconds
PhenoGraph complete in 2.3293588161468506 seconds
Found communities [-1, ... 16], with sizes: [223, 282, 188, 180, 121, 104, 76, 50, 45, 43, 37, 35, 30, 28, 28, 27, 23, 12]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21380305290222168 seconds
Jaccard graph constructed in 0.3793606758117676 seconds
Wrote graph to binary file in 0.04333972930908203 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.871621
After 3 runs, maximum modularity is Q = 0.8731
Louvain completed 23 runs in 1.4189717769622803 seconds
PhenoGraph complete in 2.067413806915283 seconds
Found communities [-1, ... 14], with sizes: [229, 359, 175, 124, 121, 83, 73, 70, 59, 56, 50, 36, 32, 28, 25, 12]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21209287643432617 seconds
Jaccard graph constructed in 0.3780078887939453 seconds
Wrote graph to binary file in 0.03366875648498535 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876966
Louvain completed 21 runs in 1.1465270519256592 seconds
PhenoGraph complete in 1.7857649326324463 seconds
Found communities [-1, ... 18], with sizes: [234, 245, 180, 142, 91, 86, 71, 69, 57, 51, 46, 41, 39, 39, 39, 27, 26, 26, 12, 11]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21363592147827148 seconds
Jaccard graph constructed in 0.37618112564086914 seconds
Wrote graph to binary file in 0.2508823871612549 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.875494
Louvain completed 21 runs in 1.1351969242095947 seconds
PhenoGraph complete in 1.9862525463104248 seconds
Found communities [-1, ... 14], with sizes: [243, 263, 190, 152, 137, 86, 82, 65, 62, 61, 57, 53, 32, 27, 11, 11]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21294927597045898 seconds
Jaccard graph constructed in 0.40344810485839844 seconds
Wrote graph to binary file in 0.02588486671447754 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868243
Louvain completed 21 runs in 1.1395483016967773 seconds
PhenoGraph complete in 1.7931420803070068 seconds
Found communities [-1, ... 13], with sizes: [243, 264, 180, 169, 109, 101, 85, 80, 64, 64, 51, 39, 31, 27, 25]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20701980590820312 seconds
Jaccard graph constructed in 0.37091708183288574 seconds
Wrote graph to binary file in 0.031915903091430664 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872594
Louvain completed 21 runs in 1.142817735671997 seconds
PhenoGraph complete in 1.767308235168457 seconds
Found communities [-1, ... 14], with sizes: [242, 292, 171, 167, 125, 86, 81, 74, 52, 50, 42, 38, 37, 31, 30, 14]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21493244171142578 seconds
Jaccard graph constructed in 0.3666503429412842 seconds
Wrote graph to binary file in 0.03907918930053711 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869055
Louvain completed 21 runs in 1.132624626159668 seconds
PhenoGraph complete in 1.7644095420837402 seconds
Found communities [-1, ... 16], with sizes: [224, 278, 205, 134, 126, 84, 72, 60, 60, 58, 52, 43, 38, 29, 24, 23, 11, 11]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.215256929397583 seconds
Jaccard graph constructed in 0.43773579597473145 seconds
Wrote graph to binary file in 0.026654720306396484 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.868166
After 6 runs, maximum modularity is Q = 0.869605
Louvain completed 26 runs in 1.5238149166107178 seconds
PhenoGraph complete in 2.2181923389434814 seconds
Found communities [-1, ... 14], with sizes: [229, 260, 182, 145, 120, 114, 94, 84, 79, 40, 39, 39, 37, 28, 21, 21]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21446871757507324 seconds
Jaccard graph constructed in 0.37337732315063477 seconds
Wrote graph to binary file in 0.05069231986999512 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.872011
After 5 runs, maximum modularity is Q = 0.873111
Louvain completed 25 runs in 1.4956026077270508 seconds
PhenoGraph complete in 2.1447155475616455 seconds
Found communities [-1, ... 16], with sizes: [262, 237, 201, 140, 116, 92, 78, 70, 55, 43, 40, 37, 35, 29, 29, 26, 21, 21]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21381545066833496 seconds
Jaccard graph constructed in 0.3762679100036621 seconds
Wrote graph to binary file in 0.03141474723815918 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.871991
Louvain completed 21 runs in 1.1394329071044922 seconds
PhenoGraph complete in 1.7746365070343018 seconds
Found communities [-1, ... 16], with sizes: [220, 252, 223, 157, 93, 90, 82, 70, 66, 56, 55, 36, 35, 26, 25, 21, 13, 12]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21435928344726562 seconds
Jaccard graph constructed in 0.36531615257263184 seconds
Wrote graph to binary file in 0.24006342887878418 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869831
After 2 runs, maximum modularity is Q = 0.871462
Louvain completed 22 runs in 1.3819866180419922 seconds
PhenoGraph complete in 2.2126288414001465 seconds
Found communities [-1, ... 15], with sizes: [227, 268, 176, 150, 87, 85, 85, 79, 77, 71, 58, 45, 35, 27, 22, 21, 19]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21565794944763184 seconds
Jaccard graph constructed in 0.4142270088195801 seconds
Wrote graph to binary file in 0.06245684623718262 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87048
After 11 runs, maximum modularity is Q = 0.871603
Louvain completed 31 runs in 2.010638952255249 seconds
PhenoGraph complete in 2.71702241897583 seconds
Found communities [-1, ... 13], with sizes: [250, 248, 184, 164, 113, 97, 75, 74, 73, 67, 55, 38, 36, 29, 29]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.215773344039917 seconds
Jaccard graph constructed in 0.3864133358001709 seconds
Wrote graph to binary file in 0.04292774200439453 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.865432
Louvain completed 21 runs in 1.1393427848815918 seconds
PhenoGraph complete in 1.7990338802337646 seconds
Found communities [-1, ... 15], with sizes: [230, 245, 207, 174, 115, 93, 84, 82, 80, 46, 35, 29, 27, 26, 23, 23, 13]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21304655075073242 seconds
Jaccard graph constructed in 0.36087560653686523 seconds
Wrote graph to binary file in 0.03091287612915039 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.878844
Louvain completed 21 runs in 1.176285982131958 seconds
PhenoGraph complete in 1.794126272201538 seconds
Found communities [-1, ... 13], with sizes: [247, 282, 166, 159, 158, 87, 81, 69, 66, 54, 46, 36, 28, 27, 26]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21434879302978516 seconds
Jaccard graph constructed in 0.3636810779571533 seconds
Wrote graph to binary file in 0.03107428550720215 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.874058
After 6 runs, maximum modularity is Q = 0.875109
Louvain completed 26 runs in 1.6364972591400146 seconds
PhenoGraph complete in 2.2630796432495117 seconds
Found communities [-1, ... 16], with sizes: [258, 260, 184, 96, 78, 78, 77, 74, 64, 60, 59, 58, 51, 35, 32, 28, 22, 18]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21393156051635742 seconds
Jaccard graph constructed in 0.42297863960266113 seconds
Wrote graph to binary file in 0.025583982467651367 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.877464
Louvain completed 21 runs in 1.1184568405151367 seconds
PhenoGraph complete in 1.7980844974517822 seconds
Found communities [-1, ... 16], with sizes: [185, 257, 236, 128, 120, 80, 78, 73, 73, 65, 57, 34, 33, 29, 28, 26, 18, 12]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.206801176071167 seconds
Jaccard graph constructed in 0.4006519317626953 seconds
Wrote graph to binary file in 0.03341984748840332 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876441
Louvain completed 21 runs in 1.320981740951538 seconds
PhenoGraph complete in 1.974416732788086 seconds
Found communities [-1, ... 13], with sizes: [226, 258, 236, 138, 115, 89, 77, 71, 71, 68, 56, 39, 35, 30, 23]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20917606353759766 seconds
Jaccard graph constructed in 0.372455358505249 seconds
Wrote graph to binary file in 0.03403520584106445 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869474
Louvain completed 21 runs in 1.1219048500061035 seconds
PhenoGraph complete in 1.748931884765625 seconds
Found communities [-1, ... 14], with sizes: [256, 281, 185, 154, 118, 86, 75, 73, 69, 54, 43, 34, 28, 27, 25, 24]

In [156]:
sc.pp.normalize_per_cell(D326_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D326_Biop_Int1) # log transform the data
D326_Biop_Int1.raw = D326_Biop_Int1 # freeze the object (for later use of the raw state of it)
In [157]:
D326_Biop_Int1 = D326_Biop_Int1[:, D326_Biop_Int1.var['ribo_genes']]
D326_Biop_Int1
Out[157]:
View of AnnData object with n_obs × n_vars = 1226 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [158]:
D339_Biop_Int1 = sc.read_10x_mtx(
    './D339_Biop_Int1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D339_Biop_Int1.var_names_make_unique()
D339_Biop_Int1.obs['manip'] = 'D339_Biop_Int1'
D339_Biop_Int1.obs['position'] = 'Intermediate'
D339_Biop_Int1.obs['method'] = 'Biopsy'
D339_Biop_Int1.obs['donor'] = 'D339'
D339_Biop_Int1.obs['name'] = ['D339_Biop_Int1_' + s for s in list(D339_Biop_Int1.obs.index)]
D339_Biop_Int1.obs_names = D339_Biop_Int1.obs['name']
D339_Biop_Int1
... reading from cache file ./cache/D339_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[158]:
AnnData object with n_obs × n_vars = 3348 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [159]:
sc.pl.highest_expr_genes(D339_Biop_Int1, n_top=20)
In [160]:
sc.pp.filter_cells(D339_Biop_Int1, min_genes=0)
mito_genes = D339_Biop_Int1.var_names.str.startswith('MT-')
D339_Biop_Int1.obs['percent_mito'] = np.sum(
    D339_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D339_Biop_Int1.X, axis=1).A1
D339_Biop_Int1.obs['n_counts'] = D339_Biop_Int1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D339_Biop_Int1.to_df())
ribo_genes = D339_Biop_Int1.to_df().columns.isin(RB_genes_in_df)
D339_Biop_Int1.obs['percent_ribo'] = np.sum(
    D339_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D339_Biop_Int1.X, axis=1).A1
D339_Biop_Int1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D339_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [161]:
sc.pp.filter_cells(D339_Biop_Int1, min_genes=500)
D339_Biop_Int1 = D339_Biop_Int1[D339_Biop_Int1.obs['n_counts'] < 30000, :]
D339_Biop_Int1 = D339_Biop_Int1[D339_Biop_Int1.obs['percent_mito'] < 0.15, :]
filtered out 9 cells that have less than 500 genes expressed
In [162]:
# scrublet
scrub = scr.Scrublet(D339_Biop_Int1.X, expected_doublet_rate=0.026)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D339_Biop_Int1.obs['doublet_scores'] = doublet_scores
D339_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.32
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 10.6%
Overall doublet rate:
	Expected   = 2.6%
	Estimated  = 3.7%
Elapsed time: 2.3 seconds
Out[162]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea9ab0a20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb0147860>],
       dtype=object))
In [163]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D339_Biop_Int1.X).predict()
D339_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6120181083679199 seconds
Jaccard graph constructed in 0.6159374713897705 seconds
Wrote graph to binary file in 0.06875061988830566 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910447
Louvain completed 21 runs in 1.5944838523864746 seconds
PhenoGraph complete in 2.907468795776367 seconds
Found communities [-1, ... 27], with sizes: [111, 718, 381, 364, 245, 229, 189, 186, 176, 133, 132, 128, 122, 122, 119, 117, 108, 96, 80, 71, 57, 54, 48, 46, 34, 28, 28, 22, 18]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7108383178710938 seconds
Jaccard graph constructed in 0.5960428714752197 seconds
Wrote graph to binary file in 0.2878448963165283 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912211
Louvain completed 21 runs in 1.5562129020690918 seconds
PhenoGraph complete in 3.165850877761841 seconds
Found communities [-1, ... 25], with sizes: [99, 720, 376, 366, 342, 228, 218, 177, 167, 162, 136, 129, 123, 120, 118, 116, 110, 86, 74, 50, 49, 48, 48, 36, 28, 22, 14]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8106706142425537 seconds
Jaccard graph constructed in 0.7108263969421387 seconds
Wrote graph to binary file in 0.08755302429199219 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909643
Louvain completed 21 runs in 1.5580415725708008 seconds
PhenoGraph complete in 3.1868462562561035 seconds
Found communities [-1, ... 22], with sizes: [112, 760, 412, 365, 348, 344, 265, 180, 174, 160, 158, 135, 118, 117, 112, 83, 62, 52, 51, 46, 41, 33, 20, 14]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7121298313140869 seconds
Jaccard graph constructed in 0.8929822444915771 seconds
Wrote graph to binary file in 0.07616424560546875 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912137
Louvain completed 21 runs in 1.5405664443969727 seconds
PhenoGraph complete in 3.241511106491089 seconds
Found communities [-1, ... 23], with sizes: [107, 789, 460, 405, 373, 342, 230, 171, 167, 156, 131, 124, 106, 90, 82, 77, 74, 64, 46, 46, 41, 29, 18, 18, 16]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8125925064086914 seconds
Jaccard graph constructed in 0.6645140647888184 seconds
Wrote graph to binary file in 0.24959516525268555 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911793
Louvain completed 21 runs in 1.548281192779541 seconds
PhenoGraph complete in 3.296985149383545 seconds
Found communities [-1, ... 23], with sizes: [121, 730, 382, 369, 337, 331, 213, 178, 158, 146, 140, 130, 123, 120, 119, 113, 91, 79, 63, 60, 59, 42, 30, 14, 14]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7121119499206543 seconds
Jaccard graph constructed in 0.6611349582672119 seconds
Wrote graph to binary file in 0.07588648796081543 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.91247
Louvain completed 21 runs in 1.5824344158172607 seconds
PhenoGraph complete in 3.051260471343994 seconds
Found communities [-1, ... 25], with sizes: [79, 786, 456, 362, 319, 309, 194, 186, 177, 170, 163, 149, 133, 87, 82, 79, 67, 62, 59, 54, 40, 33, 29, 29, 22, 19, 17]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8096714019775391 seconds
Jaccard graph constructed in 0.6530439853668213 seconds
Wrote graph to binary file in 0.28012943267822266 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913229
Louvain completed 21 runs in 1.5736398696899414 seconds
PhenoGraph complete in 3.3335275650024414 seconds
Found communities [-1, ... 26], with sizes: [130, 722, 365, 361, 359, 272, 240, 238, 191, 140, 140, 126, 120, 115, 105, 91, 82, 59, 47, 47, 44, 43, 31, 30, 20, 17, 15, 12]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.811260461807251 seconds
Jaccard graph constructed in 0.6973435878753662 seconds
Wrote graph to binary file in 0.0801093578338623 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912733
Louvain completed 21 runs in 1.5724260807037354 seconds
PhenoGraph complete in 3.179330587387085 seconds
Found communities [-1, ... 24], with sizes: [107, 762, 383, 376, 335, 329, 209, 179, 178, 171, 128, 127, 116, 112, 106, 84, 80, 64, 64, 62, 53, 44, 33, 27, 18, 15]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8147614002227783 seconds
Jaccard graph constructed in 0.6282577514648438 seconds
Wrote graph to binary file in 0.07687115669250488 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909985
After 3 runs, maximum modularity is Q = 0.911042
Louvain completed 23 runs in 1.9309871196746826 seconds
PhenoGraph complete in 3.4667675495147705 seconds
Found communities [-1, ... 24], with sizes: [97, 741, 376, 367, 342, 305, 243, 194, 193, 161, 140, 121, 115, 113, 108, 103, 92, 83, 66, 51, 41, 36, 35, 16, 12, 11]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8121042251586914 seconds
Jaccard graph constructed in 0.6219890117645264 seconds
Wrote graph to binary file in 0.26953816413879395 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910983
Louvain completed 21 runs in 1.5791409015655518 seconds
PhenoGraph complete in 3.3028202056884766 seconds
Found communities [-1, ... 26], with sizes: [98, 747, 363, 357, 347, 209, 208, 183, 164, 161, 148, 147, 129, 123, 115, 104, 100, 83, 65, 53, 47, 47, 45, 36, 32, 19, 18, 14]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8197469711303711 seconds
Jaccard graph constructed in 0.665412425994873 seconds
Wrote graph to binary file in 0.27089881896972656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910454
Louvain completed 21 runs in 1.58369779586792 seconds
PhenoGraph complete in 3.355931282043457 seconds
Found communities [-1, ... 23], with sizes: [124, 743, 364, 364, 347, 266, 238, 234, 203, 155, 144, 132, 118, 105, 101, 90, 85, 82, 62, 60, 44, 43, 29, 17, 12]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7104980945587158 seconds
Jaccard graph constructed in 0.608994722366333 seconds
Wrote graph to binary file in 0.0742805004119873 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909149
After 2 runs, maximum modularity is Q = 0.910778
Louvain completed 22 runs in 1.8385138511657715 seconds
PhenoGraph complete in 3.2510221004486084 seconds
Found communities [-1, ... 23], with sizes: [122, 721, 467, 394, 364, 262, 246, 219, 201, 150, 138, 133, 131, 114, 110, 82, 53, 47, 42, 40, 39, 37, 20, 17, 13]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7156820297241211 seconds
Jaccard graph constructed in 0.6979537010192871 seconds
Wrote graph to binary file in 0.08060169219970703 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909846
Louvain completed 21 runs in 1.7623915672302246 seconds
PhenoGraph complete in 3.2784769535064697 seconds
Found communities [-1, ... 22], with sizes: [118, 711, 397, 376, 362, 321, 268, 255, 173, 148, 135, 123, 107, 106, 103, 82, 77, 74, 62, 53, 43, 29, 20, 19]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7170310020446777 seconds
Jaccard graph constructed in 0.8543190956115723 seconds
Wrote graph to binary file in 0.07794928550720215 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907781
After 10 runs, maximum modularity is Q = 0.908969
Louvain completed 30 runs in 2.3561477661132812 seconds
PhenoGraph complete in 4.0231993198394775 seconds
Found communities [-1, ... 23], with sizes: [141, 726, 394, 368, 345, 323, 266, 221, 187, 143, 136, 128, 117, 110, 100, 80, 75, 67, 56, 47, 40, 33, 29, 16, 14]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8115262985229492 seconds
Jaccard graph constructed in 0.6196231842041016 seconds
Wrote graph to binary file in 0.27695226669311523 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909351
After 18 runs, maximum modularity is Q = 0.910505
Louvain completed 38 runs in 2.8731987476348877 seconds
PhenoGraph complete in 4.599445819854736 seconds
Found communities [-1, ... 23], with sizes: [123, 719, 397, 382, 367, 367, 268, 216, 159, 137, 129, 123, 117, 117, 85, 83, 78, 61, 54, 53, 41, 30, 22, 19, 15]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7128534317016602 seconds
Jaccard graph constructed in 0.6222641468048096 seconds
Wrote graph to binary file in 0.07622623443603516 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913185
After 3 runs, maximum modularity is Q = 0.914432
Louvain completed 23 runs in 1.8979613780975342 seconds
PhenoGraph complete in 3.325296640396118 seconds
Found communities [-1, ... 23], with sizes: [131, 749, 377, 362, 330, 250, 239, 231, 175, 166, 153, 141, 128, 117, 111, 100, 79, 74, 56, 47, 41, 38, 34, 18, 15]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8107166290283203 seconds
Jaccard graph constructed in 0.6339046955108643 seconds
Wrote graph to binary file in 0.2551889419555664 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909121
Louvain completed 21 runs in 1.600229263305664 seconds
PhenoGraph complete in 3.315030813217163 seconds
Found communities [-1, ... 22], with sizes: [137, 790, 363, 361, 353, 341, 331, 221, 144, 141, 121, 116, 114, 113, 112, 81, 61, 60, 53, 47, 39, 31, 17, 15]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7117364406585693 seconds
Jaccard graph constructed in 0.6509895324707031 seconds
Wrote graph to binary file in 0.07445740699768066 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908248
After 2 runs, maximum modularity is Q = 0.90925
After 9 runs, maximum modularity is Q = 0.910292
Louvain completed 29 runs in 2.4937171936035156 seconds
PhenoGraph complete in 3.946629047393799 seconds
Found communities [-1, ... 24], with sizes: [146, 757, 370, 334, 307, 237, 201, 201, 173, 172, 157, 153, 148, 139, 133, 110, 80, 65, 63, 63, 42, 28, 27, 21, 18, 17]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8095879554748535 seconds
Jaccard graph constructed in 0.6299993991851807 seconds
Wrote graph to binary file in 0.07435059547424316 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910371
After 2 runs, maximum modularity is Q = 0.911783
Louvain completed 22 runs in 1.8568203449249268 seconds
PhenoGraph complete in 3.386380910873413 seconds
Found communities [-1, ... 22], with sizes: [105, 718, 447, 402, 365, 279, 224, 188, 184, 175, 174, 128, 126, 97, 87, 86, 82, 54, 49, 45, 45, 38, 32, 32]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7113001346588135 seconds
Jaccard graph constructed in 0.791698694229126 seconds
Wrote graph to binary file in 0.07625126838684082 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911401
Louvain completed 21 runs in 1.565434455871582 seconds
PhenoGraph complete in 3.159489870071411 seconds
Found communities [-1, ... 24], with sizes: [150, 697, 409, 357, 338, 216, 206, 182, 173, 150, 134, 134, 128, 126, 119, 109, 102, 85, 81, 65, 60, 43, 41, 26, 16, 15]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8119308948516846 seconds
Jaccard graph constructed in 0.601323127746582 seconds
Wrote graph to binary file in 0.24887347221374512 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911418
After 12 runs, maximum modularity is Q = 0.912584
Louvain completed 32 runs in 2.530770778656006 seconds
PhenoGraph complete in 4.207291603088379 seconds
Found communities [-1, ... 26], with sizes: [130, 747, 385, 363, 335, 237, 224, 221, 169, 161, 156, 149, 119, 109, 89, 89, 81, 62, 59, 54, 47, 42, 30, 29, 28, 19, 16, 12]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8119664192199707 seconds
Jaccard graph constructed in 0.6528275012969971 seconds
Wrote graph to binary file in 0.07746100425720215 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909115
After 3 runs, maximum modularity is Q = 0.910285
Louvain completed 23 runs in 1.9448442459106445 seconds
PhenoGraph complete in 3.5069580078125 seconds
Found communities [-1, ... 25], with sizes: [118, 722, 375, 346, 337, 319, 201, 186, 177, 168, 158, 148, 127, 116, 113, 83, 74, 66, 51, 49, 48, 46, 38, 28, 25, 25, 18]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7134723663330078 seconds
Jaccard graph constructed in 0.628441572189331 seconds
Wrote graph to binary file in 0.28258347511291504 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.91328
After 4 runs, maximum modularity is Q = 0.914467
Louvain completed 24 runs in 2.0038208961486816 seconds
PhenoGraph complete in 3.643587350845337 seconds
Found communities [-1, ... 24], with sizes: [98, 768, 368, 361, 340, 231, 204, 186, 178, 172, 162, 154, 122, 120, 107, 102, 95, 85, 69, 51, 49, 44, 37, 25, 18, 16]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7119970321655273 seconds
Jaccard graph constructed in 0.6226019859313965 seconds
Wrote graph to binary file in 0.07487988471984863 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911548
After 13 runs, maximum modularity is Q = 0.912586
Louvain completed 33 runs in 2.5868496894836426 seconds
PhenoGraph complete in 4.013131618499756 seconds
Found communities [-1, ... 23], with sizes: [112, 807, 373, 369, 326, 290, 254, 194, 191, 183, 163, 136, 133, 115, 109, 81, 59, 54, 47, 47, 38, 33, 19, 16, 13]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8108634948730469 seconds
Jaccard graph constructed in 0.6436583995819092 seconds
Wrote graph to binary file in 0.07364439964294434 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906352
After 5 runs, maximum modularity is Q = 0.907726
Louvain completed 25 runs in 2.038503408432007 seconds
PhenoGraph complete in 3.588808298110962 seconds
Found communities [-1, ... 23], with sizes: [106, 697, 480, 386, 362, 259, 245, 186, 170, 161, 145, 135, 106, 103, 94, 83, 74, 69, 66, 59, 47, 45, 35, 34, 15]

In [164]:
sc.pp.normalize_per_cell(D339_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D339_Biop_Int1) # log transform the data
D339_Biop_Int1.raw = D339_Biop_Int1 # freeze the object (for later use of the raw state of it)
In [165]:
D339_Biop_Int1 = D339_Biop_Int1[:, D339_Biop_Int1.var['ribo_genes']]
D339_Biop_Int1
Out[165]:
View of AnnData object with n_obs × n_vars = 3330 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [166]:
D344_Biop_Int1 = sc.read_10x_mtx(
    './D344_Biop_Int1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D344_Biop_Int1.var_names_make_unique()
D344_Biop_Int1.obs['manip'] = 'D344_Biop_Int1'
D344_Biop_Int1.obs['position'] = 'Intermediate'
D344_Biop_Int1.obs['method'] = 'Biopsy'
D344_Biop_Int1.obs['donor'] = 'D344'
D344_Biop_Int1.obs['name'] = ['D344_Biop_Int1_' + s for s in list(D344_Biop_Int1.obs.index)]
D344_Biop_Int1.obs_names = D344_Biop_Int1.obs['name']
D344_Biop_Int1
... reading from cache file ./cache/D344_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[166]:
AnnData object with n_obs × n_vars = 1051 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [167]:
sc.pl.highest_expr_genes(D344_Biop_Int1, n_top=20)
In [168]:
sc.pp.filter_cells(D344_Biop_Int1, min_genes=0)
mito_genes = D344_Biop_Int1.var_names.str.startswith('MT-')
D344_Biop_Int1.obs['percent_mito'] = np.sum(
    D344_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D344_Biop_Int1.X, axis=1).A1
D344_Biop_Int1.obs['n_counts'] = D344_Biop_Int1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D344_Biop_Int1.to_df())
ribo_genes = D344_Biop_Int1.to_df().columns.isin(RB_genes_in_df)
D344_Biop_Int1.obs['percent_ribo'] = np.sum(
    D344_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D344_Biop_Int1.X, axis=1).A1
D344_Biop_Int1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D344_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [169]:
sc.pp.filter_cells(D344_Biop_Int1, min_genes=500)
D344_Biop_Int1 = D344_Biop_Int1[D344_Biop_Int1.obs['n_counts'] < 10000, :]
D344_Biop_Int1 = D344_Biop_Int1[D344_Biop_Int1.obs['percent_mito'] < 0.1, :]
filtered out 27 cells that have less than 500 genes expressed
In [170]:
# scrublet
scrub = scr.Scrublet(D344_Biop_Int1.X, expected_doublet_rate=0.008)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D344_Biop_Int1.obs['doublet_scores'] = doublet_scores
D344_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.07
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 34.1%
Overall doublet rate:
	Expected   = 0.8%
	Estimated  = 1.2%
Elapsed time: 0.4 seconds
Out[170]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea9f3ec88>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1eaa00e128>],
       dtype=object))
In [171]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D344_Biop_Int1.X).predict()
D344_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21145963668823242 seconds
Jaccard graph constructed in 0.3449239730834961 seconds
Wrote graph to binary file in 0.021248817443847656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.880122
Louvain completed 21 runs in 1.1646614074707031 seconds
PhenoGraph complete in 1.7518534660339355 seconds
Found communities [-1, ... 14], with sizes: [185, 297, 150, 118, 87, 80, 67, 43, 39, 39, 37, 36, 33, 14, 13, 12]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1107933521270752 seconds
Jaccard graph constructed in 0.3332080841064453 seconds
Wrote graph to binary file in 0.01955890655517578 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87899
Louvain completed 21 runs in 1.208862066268921 seconds
PhenoGraph complete in 1.6861658096313477 seconds
Found communities [-1, ... 12], with sizes: [178, 258, 213, 180, 83, 79, 55, 46, 41, 41, 33, 18, 14, 11]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10710954666137695 seconds
Jaccard graph constructed in 0.3355743885040283 seconds
Wrote graph to binary file in 0.01905989646911621 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.880221
Louvain completed 21 runs in 1.21779203414917 seconds
PhenoGraph complete in 1.6892426013946533 seconds
Found communities [-1, ... 13], with sizes: [156, 262, 160, 149, 109, 60, 59, 51, 43, 40, 39, 39, 38, 33, 12]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10601449012756348 seconds
Jaccard graph constructed in 0.3769807815551758 seconds
Wrote graph to binary file in 0.019910573959350586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882943
Louvain completed 21 runs in 1.262915849685669 seconds
PhenoGraph complete in 1.7776107788085938 seconds
Found communities [-1, ... 12], with sizes: [193, 296, 197, 162, 78, 56, 54, 50, 41, 39, 34, 28, 11, 11]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10669279098510742 seconds
Jaccard graph constructed in 0.3457822799682617 seconds
Wrote graph to binary file in 0.0184783935546875 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.878726
Louvain completed 21 runs in 1.3269686698913574 seconds
PhenoGraph complete in 1.8088159561157227 seconds
Found communities [-1, ... 15], with sizes: [185, 258, 173, 160, 80, 59, 55, 45, 41, 37, 37, 33, 26, 22, 17, 11, 11]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10851693153381348 seconds
Jaccard graph constructed in 0.37672901153564453 seconds
Wrote graph to binary file in 0.04572415351867676 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882706
Louvain completed 21 runs in 1.1272990703582764 seconds
PhenoGraph complete in 1.6748719215393066 seconds
Found communities [-1, ... 14], with sizes: [216, 282, 182, 142, 60, 57, 46, 38, 36, 36, 36, 35, 32, 26, 14, 12]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11346316337585449 seconds
Jaccard graph constructed in 0.343994140625 seconds
Wrote graph to binary file in 0.27814316749572754 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.877563
Louvain completed 21 runs in 1.1701586246490479 seconds
PhenoGraph complete in 1.9211146831512451 seconds
Found communities [-1, ... 12], with sizes: [180, 305, 207, 157, 58, 57, 46, 46, 42, 39, 33, 33, 32, 15]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1146395206451416 seconds
Jaccard graph constructed in 0.3721346855163574 seconds
Wrote graph to binary file in 0.03134775161743164 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.878177
Louvain completed 21 runs in 1.1684675216674805 seconds
PhenoGraph complete in 1.6974670886993408 seconds
Found communities [-1, ... 11], with sizes: [172, 278, 213, 187, 76, 65, 59, 50, 38, 34, 34, 33, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11188626289367676 seconds
Jaccard graph constructed in 0.35836315155029297 seconds
Wrote graph to binary file in 0.03089141845703125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.880741
Louvain completed 21 runs in 1.1703009605407715 seconds
PhenoGraph complete in 1.704502820968628 seconds
Found communities [-1, ... 13], with sizes: [227, 259, 191, 156, 59, 53, 47, 44, 40, 39, 37, 36, 33, 17, 12]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11103987693786621 seconds
Jaccard graph constructed in 0.3810455799102783 seconds
Wrote graph to binary file in 0.0421299934387207 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87909
Louvain completed 21 runs in 1.150974988937378 seconds
PhenoGraph complete in 1.6959278583526611 seconds
Found communities [-1, ... 12], with sizes: [183, 263, 188, 170, 75, 63, 58, 50, 41, 38, 37, 36, 35, 13]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10722994804382324 seconds
Jaccard graph constructed in 0.38977789878845215 seconds
Wrote graph to binary file in 0.03473496437072754 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.880149
Louvain completed 21 runs in 1.3495912551879883 seconds
PhenoGraph complete in 1.8937199115753174 seconds
Found communities [-1, ... 12], with sizes: [215, 280, 156, 153, 85, 66, 55, 52, 50, 45, 38, 33, 11, 11]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10716843605041504 seconds
Jaccard graph constructed in 0.36541080474853516 seconds
Wrote graph to binary file in 0.03273296356201172 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.880606
Louvain completed 21 runs in 1.1485414505004883 seconds
PhenoGraph complete in 1.663989543914795 seconds
Found communities [-1, ... 12], with sizes: [172, 310, 177, 157, 78, 56, 47, 44, 40, 39, 39, 37, 32, 22]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10730195045471191 seconds
Jaccard graph constructed in 0.3606255054473877 seconds
Wrote graph to binary file in 0.030358552932739258 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876205
Louvain completed 21 runs in 1.1489663124084473 seconds
PhenoGraph complete in 1.65879487991333 seconds
Found communities [-1, ... 10], with sizes: [189, 281, 202, 164, 78, 73, 72, 45, 41, 36, 36, 33]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11252951622009277 seconds
Jaccard graph constructed in 0.35149550437927246 seconds
Wrote graph to binary file in 0.03131890296936035 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.88077
Louvain completed 21 runs in 1.1583356857299805 seconds
PhenoGraph complete in 1.6760649681091309 seconds
Found communities [-1, ... 12], with sizes: [178, 305, 158, 157, 80, 71, 53, 51, 42, 40, 38, 36, 22, 19]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10689330101013184 seconds
Jaccard graph constructed in 0.3616914749145508 seconds
Wrote graph to binary file in 0.03846144676208496 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.878707
Louvain completed 21 runs in 1.153308391571045 seconds
PhenoGraph complete in 1.6763839721679688 seconds
Found communities [-1, ... 11], with sizes: [196, 284, 167, 138, 113, 75, 56, 52, 42, 36, 36, 32, 23]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10843348503112793 seconds
Jaccard graph constructed in 0.37569642066955566 seconds
Wrote graph to binary file in 0.05348014831542969 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87889
Louvain completed 21 runs in 1.142681360244751 seconds
PhenoGraph complete in 1.6932039260864258 seconds
Found communities [-1, ... 12], with sizes: [174, 277, 158, 157, 117, 71, 69, 48, 38, 36, 36, 34, 24, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10575532913208008 seconds
Jaccard graph constructed in 0.36235928535461426 seconds
Wrote graph to binary file in 0.028481245040893555 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882936
Louvain completed 21 runs in 1.1972129344940186 seconds
PhenoGraph complete in 1.7038843631744385 seconds
Found communities [-1, ... 13], with sizes: [196, 314, 162, 160, 56, 54, 49, 41, 40, 40, 40, 39, 33, 13, 13]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11168026924133301 seconds
Jaccard graph constructed in 0.3540842533111572 seconds
Wrote graph to binary file in 0.27155423164367676 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.881863
Louvain completed 21 runs in 1.1280722618103027 seconds
PhenoGraph complete in 1.8748860359191895 seconds
Found communities [-1, ... 13], with sizes: [208, 289, 156, 98, 78, 68, 63, 56, 46, 46, 41, 37, 33, 20, 11]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11098194122314453 seconds
Jaccard graph constructed in 0.4335055351257324 seconds
Wrote graph to binary file in 0.022338390350341797 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.876497
Louvain completed 21 runs in 1.1879448890686035 seconds
PhenoGraph complete in 1.7635250091552734 seconds
Found communities [-1, ... 12], with sizes: [187, 276, 180, 166, 80, 76, 54, 50, 42, 38, 36, 31, 18, 16]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11251401901245117 seconds
Jaccard graph constructed in 0.3694918155670166 seconds
Wrote graph to binary file in 0.03946948051452637 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87899
Louvain completed 21 runs in 1.168576717376709 seconds
PhenoGraph complete in 1.7103748321533203 seconds
Found communities [-1, ... 12], with sizes: [199, 297, 198, 167, 86, 47, 42, 40, 36, 34, 33, 32, 23, 16]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11192011833190918 seconds
Jaccard graph constructed in 0.3561713695526123 seconds
Wrote graph to binary file in 0.02805948257446289 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.88075
Louvain completed 21 runs in 1.1818695068359375 seconds
PhenoGraph complete in 1.6874995231628418 seconds
Found communities [-1, ... 13], with sizes: [146, 297, 179, 160, 74, 69, 60, 55, 47, 45, 41, 33, 16, 15, 13]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1140294075012207 seconds
Jaccard graph constructed in 0.4274132251739502 seconds
Wrote graph to binary file in 0.021744966506958008 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.88156
Louvain completed 21 runs in 1.1601512432098389 seconds
PhenoGraph complete in 1.7319869995117188 seconds
Found communities [-1, ... 11], with sizes: [198, 283, 206, 178, 90, 50, 42, 42, 41, 35, 35, 33, 17]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11182069778442383 seconds
Jaccard graph constructed in 0.35143613815307617 seconds
Wrote graph to binary file in 0.03067612648010254 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.875879
Louvain completed 21 runs in 1.1906437873840332 seconds
PhenoGraph complete in 1.6964304447174072 seconds
Found communities [-1, ... 10], with sizes: [203, 282, 183, 174, 93, 76, 72, 41, 34, 34, 33, 25]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11396241188049316 seconds
Jaccard graph constructed in 0.35881948471069336 seconds
Wrote graph to binary file in 0.03446149826049805 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87648
Louvain completed 21 runs in 1.1733543872833252 seconds
PhenoGraph complete in 1.6909277439117432 seconds
Found communities [-1, ... 9], with sizes: [171, 308, 205, 179, 80, 76, 67, 48, 47, 35, 34]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1117711067199707 seconds
Jaccard graph constructed in 0.36495161056518555 seconds
Wrote graph to binary file in 0.0581965446472168 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.878107
Louvain completed 21 runs in 1.1425848007202148 seconds
PhenoGraph complete in 1.690443992614746 seconds
Found communities [-1, ... 12], with sizes: [192, 305, 194, 182, 71, 49, 42, 41, 38, 37, 32, 31, 23, 13]

In [172]:
sc.pp.normalize_per_cell(D344_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D344_Biop_Int1) # log transform the data
D344_Biop_Int1.raw = D344_Biop_Int1 # freeze the object (for later use of the raw state of it)
In [173]:
D344_Biop_Int1 = D344_Biop_Int1[:, D344_Biop_Int1.var['ribo_genes']]
D344_Biop_Int1
Out[173]:
View of AnnData object with n_obs × n_vars = 1000 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [174]:
D353_Biop_Int2 = sc.read_10x_mtx(
    './D353_Biop_Int2/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D353_Biop_Int2.var_names_make_unique()
D353_Biop_Int2.obs['manip'] = 'D353_Biop_Int2'
D353_Biop_Int2.obs['position'] = 'Intermediate'
D353_Biop_Int2.obs['method'] = 'Biopsy'
D353_Biop_Int2.obs['donor'] = 'D353'
D353_Biop_Int2.obs['name'] = ['D353_Biop_Int2_' + s for s in list(D353_Biop_Int2.obs.index)]
D353_Biop_Int2.obs_names = D353_Biop_Int2.obs['name']
D353_Biop_Int2
... reading from cache file ./cache/D353_Biop_Int2-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[174]:
AnnData object with n_obs × n_vars = 2291 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [175]:
sc.pl.highest_expr_genes(D353_Biop_Int2, n_top=20)
In [176]:
sc.pp.filter_cells(D353_Biop_Int2, min_genes=0)
mito_genes = D353_Biop_Int2.var_names.str.startswith('MT-')
D353_Biop_Int2.obs['percent_mito'] = np.sum(
    D353_Biop_Int2[:, mito_genes].X, axis=1).A1 / np.sum(D353_Biop_Int2.X, axis=1).A1
D353_Biop_Int2.obs['n_counts'] = D353_Biop_Int2.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Biop_Int2.to_df())
ribo_genes = D353_Biop_Int2.to_df().columns.isin(RB_genes_in_df)
D353_Biop_Int2.obs['percent_ribo'] = np.sum(
    D353_Biop_Int2[:, ribo_genes].X, axis=1).A1 / np.sum(D353_Biop_Int2.X, axis=1).A1
D353_Biop_Int2.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D353_Biop_Int2, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [177]:
sc.pp.filter_cells(D353_Biop_Int2, min_genes=500)
D353_Biop_Int2 = D353_Biop_Int2[D353_Biop_Int2.obs['n_counts'] < 10000, :]
D353_Biop_Int2 = D353_Biop_Int2[D353_Biop_Int2.obs['percent_mito'] < 0.15, :]
filtered out 72 cells that have less than 500 genes expressed
In [178]:
# scrublet
scrub = scr.Scrublet(D353_Biop_Int2.X, expected_doublet_rate=0.018)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D353_Biop_Int2.obs['doublet_scores'] = doublet_scores
D353_Biop_Int2.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.21
Detected doublet rate = 0.5%
Estimated detectable doublet fraction = 14.3%
Overall doublet rate:
	Expected   = 1.8%
	Estimated  = 3.8%
Elapsed time: 1.1 seconds
Out[178]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea9933cf8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb95dc9e8>],
       dtype=object))
In [179]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D353_Biop_Int2.X).predict()
D353_Biop_Int2.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4091353416442871 seconds
Jaccard graph constructed in 0.6043984889984131 seconds
Wrote graph to binary file in 0.03512072563171387 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890691
Louvain completed 21 runs in 1.3956241607666016 seconds
PhenoGraph complete in 2.4567246437072754 seconds
Found communities [-1, ... 15], with sizes: [228, 1040, 693, 144, 95, 79, 78, 61, 60, 46, 44, 39, 38, 28, 26, 24, 22]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4080648422241211 seconds
Jaccard graph constructed in 0.5825128555297852 seconds
Wrote graph to binary file in 0.03633427619934082 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893712
Louvain completed 21 runs in 1.4647831916809082 seconds
PhenoGraph complete in 2.5091614723205566 seconds
Found communities [-1, ... 16], with sizes: [222, 1040, 356, 332, 134, 95, 81, 78, 76, 65, 50, 48, 35, 34, 30, 26, 23, 20]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40767621994018555 seconds
Jaccard graph constructed in 0.5647685527801514 seconds
Wrote graph to binary file in 0.03490257263183594 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893473
Louvain completed 21 runs in 1.440371036529541 seconds
PhenoGraph complete in 2.4670298099517822 seconds
Found communities [-1, ... 15], with sizes: [194, 1066, 496, 228, 139, 90, 80, 78, 78, 67, 47, 47, 35, 33, 28, 25, 14]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4076704978942871 seconds
Jaccard graph constructed in 0.539881706237793 seconds
Wrote graph to binary file in 0.292696475982666 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891316
After 6 runs, maximum modularity is Q = 0.892575
Louvain completed 26 runs in 2.03838849067688 seconds
PhenoGraph complete in 3.292109489440918 seconds
Found communities [-1, ... 15], with sizes: [278, 1029, 358, 343, 132, 86, 81, 75, 59, 58, 49, 45, 41, 31, 30, 26, 24]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.517542839050293 seconds
Jaccard graph constructed in 0.5562949180603027 seconds
Wrote graph to binary file in 0.03730154037475586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895413
Louvain completed 21 runs in 1.4487016201019287 seconds
PhenoGraph complete in 2.579259157180786 seconds
Found communities [-1, ... 17], with sizes: [177, 1055, 568, 179, 136, 101, 81, 76, 67, 47, 41, 38, 38, 35, 34, 26, 20, 15, 11]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5140190124511719 seconds
Jaccard graph constructed in 0.5885076522827148 seconds
Wrote graph to binary file in 0.03614401817321777 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890801
Louvain completed 21 runs in 1.4445881843566895 seconds
PhenoGraph complete in 2.5949974060058594 seconds
Found communities [-1, ... 16], with sizes: [247, 1060, 357, 324, 123, 101, 99, 78, 68, 46, 43, 41, 41, 28, 25, 24, 21, 19]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5123090744018555 seconds
Jaccard graph constructed in 0.555894136428833 seconds
Wrote graph to binary file in 0.03653597831726074 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893846
Louvain completed 21 runs in 1.4240570068359375 seconds
PhenoGraph complete in 2.542696714401245 seconds
Found communities [-1, ... 15], with sizes: [240, 999, 384, 373, 116, 91, 87, 84, 73, 64, 47, 45, 41, 35, 28, 23, 15]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4090099334716797 seconds
Jaccard graph constructed in 0.5889580249786377 seconds
Wrote graph to binary file in 0.035860300064086914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.896264
Louvain completed 21 runs in 1.4150359630584717 seconds
PhenoGraph complete in 2.462589979171753 seconds
Found communities [-1, ... 16], with sizes: [190, 1051, 369, 353, 154, 78, 75, 68, 66, 63, 47, 44, 39, 38, 35, 31, 28, 16]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40950894355773926 seconds
Jaccard graph constructed in 0.8809564113616943 seconds
Wrote graph to binary file in 0.03651571273803711 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89772
Louvain completed 21 runs in 1.3908169269561768 seconds
PhenoGraph complete in 2.7316763401031494 seconds
Found communities [-1, ... 15], with sizes: [205, 1043, 367, 352, 136, 103, 86, 82, 78, 61, 47, 45, 43, 32, 29, 25, 11]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.511075496673584 seconds
Jaccard graph constructed in 0.5945179462432861 seconds
Wrote graph to binary file in 0.035491943359375 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.887438
Louvain completed 21 runs in 1.40632963180542 seconds
PhenoGraph complete in 2.562772274017334 seconds
Found communities [-1, ... 16], with sizes: [246, 988, 499, 261, 114, 89, 87, 76, 72, 65, 48, 43, 40, 40, 25, 24, 17, 11]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5103023052215576 seconds
Jaccard graph constructed in 0.5586209297180176 seconds
Wrote graph to binary file in 0.03708052635192871 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893353
Louvain completed 21 runs in 1.405564308166504 seconds
PhenoGraph complete in 2.523803949356079 seconds
Found communities [-1, ... 15], with sizes: [215, 1133, 363, 285, 149, 85, 79, 73, 68, 67, 52, 42, 39, 31, 26, 25, 13]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4121861457824707 seconds
Jaccard graph constructed in 0.5916233062744141 seconds
Wrote graph to binary file in 0.036779165267944336 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.892858
Louvain completed 21 runs in 1.4294745922088623 seconds
PhenoGraph complete in 2.4855849742889404 seconds
Found communities [-1, ... 17], with sizes: [192, 1026, 384, 361, 114, 104, 77, 72, 71, 63, 53, 46, 43, 28, 28, 27, 24, 16, 16]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5090172290802002 seconds
Jaccard graph constructed in 0.5467166900634766 seconds
Wrote graph to binary file in 0.2511570453643799 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.896091
Louvain completed 21 runs in 1.407749891281128 seconds
PhenoGraph complete in 2.729004383087158 seconds
Found communities [-1, ... 17], with sizes: [233, 1007, 380, 349, 155, 92, 79, 78, 59, 55, 46, 41, 35, 32, 29, 26, 23, 14, 12]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41003990173339844 seconds
Jaccard graph constructed in 0.5629570484161377 seconds
Wrote graph to binary file in 0.05934643745422363 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89884
Louvain completed 21 runs in 1.4473161697387695 seconds
PhenoGraph complete in 2.4936909675598145 seconds
Found communities [-1, ... 16], with sizes: [194, 1021, 535, 230, 145, 92, 77, 71, 68, 68, 51, 43, 36, 33, 30, 28, 12, 11]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4100019931793213 seconds
Jaccard graph constructed in 0.5669147968292236 seconds
Wrote graph to binary file in 0.03788328170776367 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895837
Louvain completed 21 runs in 1.384540319442749 seconds
PhenoGraph complete in 2.417604684829712 seconds
Found communities [-1, ... 16], with sizes: [212, 1009, 432, 337, 114, 102, 85, 77, 66, 57, 47, 42, 41, 39, 27, 26, 20, 12]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4081096649169922 seconds
Jaccard graph constructed in 0.5636246204376221 seconds
Wrote graph to binary file in 0.03511691093444824 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891539
Louvain completed 21 runs in 1.3966548442840576 seconds
PhenoGraph complete in 2.418062448501587 seconds
Found communities [-1, ... 14], with sizes: [169, 1191, 396, 205, 145, 96, 91, 89, 78, 61, 47, 47, 46, 29, 28, 27]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4078061580657959 seconds
Jaccard graph constructed in 0.5556800365447998 seconds
Wrote graph to binary file in 0.25050973892211914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895265
Louvain completed 21 runs in 1.4412474632263184 seconds
PhenoGraph complete in 2.671192169189453 seconds
Found communities [-1, ... 15], with sizes: [193, 1115, 679, 106, 88, 79, 76, 73, 72, 47, 46, 35, 32, 31, 29, 22, 22]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4085850715637207 seconds
Jaccard graph constructed in 0.5557441711425781 seconds
Wrote graph to binary file in 0.03228044509887695 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.889227
After 7 runs, maximum modularity is Q = 0.890463
Louvain completed 27 runs in 1.9141814708709717 seconds
PhenoGraph complete in 2.9237189292907715 seconds
Found communities [-1, ... 14], with sizes: [231, 1022, 373, 367, 130, 94, 92, 75, 75, 66, 52, 48, 32, 31, 30, 27]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5096926689147949 seconds
Jaccard graph constructed in 0.6055824756622314 seconds
Wrote graph to binary file in 0.04024457931518555 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.896855
After 2 runs, maximum modularity is Q = 0.898196
Louvain completed 22 runs in 1.6625196933746338 seconds
PhenoGraph complete in 2.8347229957580566 seconds
Found communities [-1, ... 15], with sizes: [226, 1080, 335, 320, 152, 97, 77, 77, 77, 60, 50, 46, 43, 41, 27, 26, 11]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4100353717803955 seconds
Jaccard graph constructed in 0.5605344772338867 seconds
Wrote graph to binary file in 0.034934043884277344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886007
After 17 runs, maximum modularity is Q = 0.887034
Louvain completed 37 runs in 2.4867916107177734 seconds
PhenoGraph complete in 3.5050947666168213 seconds
Found communities [-1, ... 16], with sizes: [205, 1021, 392, 368, 137, 126, 98, 78, 62, 46, 36, 31, 31, 25, 25, 25, 25, 14]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.509087324142456 seconds
Jaccard graph constructed in 0.5716879367828369 seconds
Wrote graph to binary file in 0.034432172775268555 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89655
Louvain completed 21 runs in 1.4189376831054688 seconds
PhenoGraph complete in 2.5461370944976807 seconds
Found communities [-1, ... 15], with sizes: [217, 1048, 361, 343, 133, 96, 86, 84, 80, 65, 48, 40, 36, 30, 29, 25, 24]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4082791805267334 seconds
Jaccard graph constructed in 0.7765345573425293 seconds
Wrote graph to binary file in 0.03472542762756348 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893439
Louvain completed 21 runs in 1.4554274082183838 seconds
PhenoGraph complete in 2.6879711151123047 seconds
Found communities [-1, ... 14], with sizes: [216, 1238, 515, 131, 104, 83, 80, 79, 62, 48, 41, 37, 36, 25, 25, 25]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41226720809936523 seconds
Jaccard graph constructed in 0.5759696960449219 seconds
Wrote graph to binary file in 0.03444170951843262 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.894657
Louvain completed 21 runs in 1.3930928707122803 seconds
PhenoGraph complete in 2.4279897212982178 seconds
Found communities [-1, ... 15], with sizes: [179, 1038, 380, 379, 158, 87, 85, 77, 75, 57, 47, 39, 35, 32, 29, 28, 20]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4079315662384033 seconds
Jaccard graph constructed in 0.5689308643341064 seconds
Wrote graph to binary file in 0.034728050231933594 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.897891
Louvain completed 21 runs in 1.4391758441925049 seconds
PhenoGraph complete in 2.466240882873535 seconds
Found communities [-1, ... 15], with sizes: [206, 1038, 521, 197, 154, 88, 86, 83, 80, 58, 46, 45, 38, 32, 31, 28, 14]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4070625305175781 seconds
Jaccard graph constructed in 0.5509645938873291 seconds
Wrote graph to binary file in 0.03459048271179199 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.896833
Louvain completed 21 runs in 1.3844196796417236 seconds
PhenoGraph complete in 2.3938353061676025 seconds
Found communities [-1, ... 18], with sizes: [185, 1065, 366, 338, 117, 85, 77, 75, 75, 65, 48, 41, 36, 32, 27, 26, 26, 25, 20, 16]

In [180]:
sc.pp.normalize_per_cell(D353_Biop_Int2, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D353_Biop_Int2) # log transform the data
D353_Biop_Int2.raw = D353_Biop_Int2 # freeze the object (for later use of the raw state of it)
In [181]:
D353_Biop_Int2 = D353_Biop_Int2[:, D353_Biop_Int2.var['ribo_genes']]
D353_Biop_Int2
Out[181]:
View of AnnData object with n_obs × n_vars = 2196 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [182]:
D354_Biop_Int2 = sc.read_10x_mtx(
    './D354_Biop_Int2/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D354_Biop_Int2.var_names_make_unique()
D354_Biop_Int2.obs['manip'] = 'D354_Biop_Int2'
D354_Biop_Int2.obs['position'] = 'Intermediate'
D354_Biop_Int2.obs['method'] = 'Biopsy'
D354_Biop_Int2.obs['donor'] = 'D354'
D354_Biop_Int2.obs['name'] = ['D354_Biop_Int2_' + s for s in list(D354_Biop_Int2.obs.index)]
D354_Biop_Int2.obs_names = D354_Biop_Int2.obs['name']
D354_Biop_Int2
... reading from cache file ./cache/D354_Biop_Int2-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[182]:
AnnData object with n_obs × n_vars = 2775 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [183]:
sc.pl.highest_expr_genes(D354_Biop_Int2, n_top=20)
In [184]:
sc.pp.filter_cells(D354_Biop_Int2, min_genes=0)
mito_genes = D354_Biop_Int2.var_names.str.startswith('MT-')
D354_Biop_Int2.obs['percent_mito'] = np.sum(
    D354_Biop_Int2[:, mito_genes].X, axis=1).A1 / np.sum(D354_Biop_Int2.X, axis=1).A1
D354_Biop_Int2.obs['n_counts'] = D354_Biop_Int2.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D354_Biop_Int2.to_df())
ribo_genes = D354_Biop_Int2.to_df().columns.isin(RB_genes_in_df)
D354_Biop_Int2.obs['percent_ribo'] = np.sum(
    D354_Biop_Int2[:, ribo_genes].X, axis=1).A1 / np.sum(D354_Biop_Int2.X, axis=1).A1
D354_Biop_Int2.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D354_Biop_Int2, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [185]:
sc.pp.filter_cells(D354_Biop_Int2, min_genes=500)
D354_Biop_Int2 = D354_Biop_Int2[D354_Biop_Int2.obs['n_counts'] < 20000, :]
D354_Biop_Int2 = D354_Biop_Int2[D354_Biop_Int2.obs['percent_mito'] < 0.2, :]
filtered out 51 cells that have less than 500 genes expressed
In [186]:
# scrublet
scrub = scr.Scrublet(D354_Biop_Int2.X, expected_doublet_rate=0.022)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D354_Biop_Int2.obs['doublet_scores'] = doublet_scores
D354_Biop_Int2.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.26
Detected doublet rate = 0.6%
Estimated detectable doublet fraction = 14.5%
Overall doublet rate:
	Expected   = 2.2%
	Estimated  = 4.1%
Elapsed time: 1.6 seconds
Out[186]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1eb525b3c8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9f3bc9e8>],
       dtype=object))
In [187]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D354_Biop_Int2.X).predict()
D354_Biop_Int2.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6089456081390381 seconds
Jaccard graph constructed in 0.5493769645690918 seconds
Wrote graph to binary file in 0.047583818435668945 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903417
After 3 runs, maximum modularity is Q = 0.904579
Louvain completed 23 runs in 1.9260752201080322 seconds
PhenoGraph complete in 3.146135091781616 seconds
Found communities [-1, ... 20], with sizes: [243, 1162, 328, 305, 257, 185, 178, 161, 96, 77, 74, 46, 45, 41, 35, 31, 29, 25, 25, 17, 16, 12]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6099333763122559 seconds
Jaccard graph constructed in 0.5788750648498535 seconds
Wrote graph to binary file in 0.04790472984313965 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906312
After 13 runs, maximum modularity is Q = 0.907492
Louvain completed 33 runs in 2.5305252075195312 seconds
PhenoGraph complete in 3.7877907752990723 seconds
Found communities [-1, ... 19], with sizes: [227, 1056, 364, 363, 267, 201, 161, 148, 99, 94, 73, 62, 47, 43, 36, 33, 33, 28, 26, 15, 12]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6165721416473389 seconds
Jaccard graph constructed in 0.583214282989502 seconds
Wrote graph to binary file in 0.2681753635406494 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902316
Louvain completed 21 runs in 1.530259370803833 seconds
PhenoGraph complete in 3.0137455463409424 seconds
Found communities [-1, ... 17], with sizes: [233, 1165, 348, 305, 294, 180, 173, 165, 117, 86, 72, 45, 44, 43, 31, 30, 23, 21, 13]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.61187744140625 seconds
Jaccard graph constructed in 0.6052114963531494 seconds
Wrote graph to binary file in 0.050127506256103516 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903096
Louvain completed 21 runs in 1.5214214324951172 seconds
PhenoGraph complete in 2.80751895904541 seconds
Found communities [-1, ... 15], with sizes: [225, 1095, 374, 337, 309, 269, 184, 164, 137, 45, 44, 43, 42, 36, 35, 26, 23]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6149189472198486 seconds
Jaccard graph constructed in 0.6141078472137451 seconds
Wrote graph to binary file in 0.04991936683654785 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908502
Louvain completed 21 runs in 1.5285391807556152 seconds
PhenoGraph complete in 2.822324752807617 seconds
Found communities [-1, ... 18], with sizes: [252, 1276, 532, 164, 152, 151, 140, 115, 114, 93, 88, 46, 46, 45, 37, 31, 30, 30, 25, 21]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.611790657043457 seconds
Jaccard graph constructed in 0.5846893787384033 seconds
Wrote graph to binary file in 0.2656435966491699 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903634
After 3 runs, maximum modularity is Q = 0.904811
Louvain completed 23 runs in 1.9008080959320068 seconds
PhenoGraph complete in 3.378009796142578 seconds
Found communities [-1, ... 18], with sizes: [277, 1062, 366, 325, 248, 187, 185, 157, 116, 115, 45, 44, 40, 40, 40, 36, 33, 28, 24, 20]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6172959804534912 seconds
Jaccard graph constructed in 0.5735414028167725 seconds
Wrote graph to binary file in 0.04891705513000488 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899241
After 2 runs, maximum modularity is Q = 0.902945
Louvain completed 22 runs in 1.845940113067627 seconds
PhenoGraph complete in 3.100114107131958 seconds
Found communities [-1, ... 17], with sizes: [244, 1134, 363, 332, 229, 192, 186, 170, 118, 108, 56, 48, 46, 44, 32, 28, 25, 19, 14]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6118593215942383 seconds
Jaccard graph constructed in 0.5889749526977539 seconds
Wrote graph to binary file in 0.046532630920410156 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903312
Louvain completed 21 runs in 1.515474557876587 seconds
PhenoGraph complete in 2.7765424251556396 seconds
Found communities [-1, ... 18], with sizes: [287, 1084, 353, 340, 211, 196, 172, 159, 159, 101, 77, 46, 43, 40, 30, 25, 21, 18, 15, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6111633777618408 seconds
Jaccard graph constructed in 0.5823967456817627 seconds
Wrote graph to binary file in 0.288393497467041 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906185
After 3 runs, maximum modularity is Q = 0.907497
Louvain completed 23 runs in 1.8971140384674072 seconds
PhenoGraph complete in 3.393420457839966 seconds
Found communities [-1, ... 17], with sizes: [271, 1082, 409, 299, 254, 212, 173, 156, 110, 109, 48, 45, 44, 41, 35, 34, 26, 22, 18]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.610567569732666 seconds
Jaccard graph constructed in 0.6041157245635986 seconds
Wrote graph to binary file in 0.04715108871459961 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90188
Louvain completed 21 runs in 1.5208909511566162 seconds
PhenoGraph complete in 2.7960779666900635 seconds
Found communities [-1, ... 18], with sizes: [277, 1097, 354, 340, 237, 197, 169, 116, 114, 104, 76, 50, 46, 44, 42, 32, 29, 24, 21, 19]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6107847690582275 seconds
Jaccard graph constructed in 0.5874667167663574 seconds
Wrote graph to binary file in 0.04792428016662598 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901651
After 4 runs, maximum modularity is Q = 0.902827
Louvain completed 24 runs in 1.9363698959350586 seconds
PhenoGraph complete in 3.196136236190796 seconds
Found communities [-1, ... 17], with sizes: [256, 1115, 384, 292, 289, 218, 172, 165, 163, 47, 47, 46, 45, 31, 31, 28, 28, 20, 11]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.612783670425415 seconds
Jaccard graph constructed in 0.5973565578460693 seconds
Wrote graph to binary file in 0.25495386123657227 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904834
After 4 runs, maximum modularity is Q = 0.906356
Louvain completed 24 runs in 1.9502596855163574 seconds
PhenoGraph complete in 3.4298713207244873 seconds
Found communities [-1, ... 21], with sizes: [241, 1073, 372, 370, 236, 191, 177, 148, 118, 108, 45, 43, 42, 37, 33, 27, 26, 24, 21, 19, 13, 13, 11]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.517777681350708 seconds
Jaccard graph constructed in 0.5723652839660645 seconds
Wrote graph to binary file in 0.04760384559631348 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899283
Louvain completed 21 runs in 1.5190637111663818 seconds
PhenoGraph complete in 2.6748862266540527 seconds
Found communities [-1, ... 17], with sizes: [249, 1126, 358, 324, 275, 268, 165, 158, 101, 79, 44, 43, 43, 34, 32, 27, 23, 20, 19]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6115403175354004 seconds
Jaccard graph constructed in 0.5914480686187744 seconds
Wrote graph to binary file in 0.046744346618652344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904794
Louvain completed 21 runs in 1.5596249103546143 seconds
PhenoGraph complete in 2.8239617347717285 seconds
Found communities [-1, ... 18], with sizes: [292, 1044, 376, 340, 267, 183, 151, 143, 116, 106, 78, 49, 45, 45, 34, 30, 28, 24, 21, 16]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6164212226867676 seconds
Jaccard graph constructed in 0.5935096740722656 seconds
Wrote graph to binary file in 0.04588150978088379 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.897071
After 2 runs, maximum modularity is Q = 0.898997
Louvain completed 22 runs in 1.897925853729248 seconds
PhenoGraph complete in 3.1674208641052246 seconds
Found communities [-1, ... 18], with sizes: [284, 1277, 471, 235, 208, 173, 168, 166, 62, 51, 46, 41, 41, 39, 31, 31, 21, 17, 14, 12]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6152329444885254 seconds
Jaccard graph constructed in 0.8159730434417725 seconds
Wrote graph to binary file in 0.04668879508972168 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901192
After 4 runs, maximum modularity is Q = 0.902879
Louvain completed 24 runs in 1.9839365482330322 seconds
PhenoGraph complete in 3.481740951538086 seconds
Found communities [-1, ... 17], with sizes: [231, 1120, 409, 351, 233, 189, 178, 163, 155, 75, 46, 41, 41, 40, 31, 30, 23, 19, 13]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6104776859283447 seconds
Jaccard graph constructed in 0.5887281894683838 seconds
Wrote graph to binary file in 0.04852747917175293 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902573
Louvain completed 21 runs in 1.5776691436767578 seconds
PhenoGraph complete in 2.8476829528808594 seconds
Found communities [-1, ... 18], with sizes: [266, 1102, 355, 348, 235, 165, 157, 157, 133, 102, 94, 46, 43, 37, 36, 32, 29, 22, 16, 13]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6126482486724854 seconds
Jaccard graph constructed in 0.5858221054077148 seconds
Wrote graph to binary file in 0.04575514793395996 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902961
Louvain completed 21 runs in 1.546755313873291 seconds
PhenoGraph complete in 2.8106656074523926 seconds
Found communities [-1, ... 19], with sizes: [260, 1149, 342, 331, 213, 185, 183, 161, 105, 70, 68, 56, 45, 42, 36, 30, 24, 23, 22, 22, 21]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.60927414894104 seconds
Jaccard graph constructed in 0.5587124824523926 seconds
Wrote graph to binary file in 0.23276758193969727 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903275
Louvain completed 21 runs in 1.5103824138641357 seconds
PhenoGraph complete in 2.924783229827881 seconds
Found communities [-1, ... 15], with sizes: [265, 1095, 354, 337, 274, 208, 188, 149, 117, 110, 75, 45, 43, 42, 33, 31, 22]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6164610385894775 seconds
Jaccard graph constructed in 0.5717639923095703 seconds
Wrote graph to binary file in 0.04487180709838867 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902751
After 3 runs, maximum modularity is Q = 0.904469
Louvain completed 23 runs in 1.8957345485687256 seconds
PhenoGraph complete in 3.1415188312530518 seconds
Found communities [-1, ... 17], with sizes: [228, 1115, 386, 329, 232, 230, 180, 132, 114, 105, 73, 45, 44, 37, 36, 31, 25, 23, 23]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6134498119354248 seconds
Jaccard graph constructed in 0.5802252292633057 seconds
Wrote graph to binary file in 0.04542064666748047 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901305
Louvain completed 21 runs in 1.515362024307251 seconds
PhenoGraph complete in 2.768453598022461 seconds
Found communities [-1, ... 18], with sizes: [253, 1154, 350, 331, 266, 196, 184, 149, 96, 83, 61, 46, 44, 44, 35, 30, 22, 17, 16, 11]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.615567684173584 seconds
Jaccard graph constructed in 0.5754516124725342 seconds
Wrote graph to binary file in 0.2666294574737549 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905599
After 12 runs, maximum modularity is Q = 0.906617
Louvain completed 32 runs in 2.4448866844177246 seconds
PhenoGraph complete in 3.916701078414917 seconds
Found communities [-1, ... 19], with sizes: [268, 1085, 379, 343, 205, 170, 168, 140, 124, 100, 91, 46, 46, 42, 41, 34, 28, 27, 22, 18, 11]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.612835168838501 seconds
Jaccard graph constructed in 0.5812299251556396 seconds
Wrote graph to binary file in 0.04573798179626465 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90242
Louvain completed 21 runs in 1.488856315612793 seconds
PhenoGraph complete in 2.7458574771881104 seconds
Found communities [-1, ... 17], with sizes: [235, 1072, 512, 269, 254, 202, 184, 153, 120, 116, 45, 42, 35, 29, 25, 24, 24, 24, 23]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.6147117614746094 seconds
Jaccard graph constructed in 0.5668911933898926 seconds
Wrote graph to binary file in 0.04675579071044922 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900681
After 3 runs, maximum modularity is Q = 0.901692
Louvain completed 23 runs in 1.8528432846069336 seconds
PhenoGraph complete in 3.095226764678955 seconds
Found communities [-1, ... 19], with sizes: [222, 1049, 531, 262, 244, 174, 169, 150, 107, 105, 92, 45, 42, 37, 33, 28, 26, 22, 20, 16, 14]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.611067533493042 seconds
Jaccard graph constructed in 0.5879242420196533 seconds
Wrote graph to binary file in 0.2729678153991699 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904636
Louvain completed 21 runs in 1.540919303894043 seconds
PhenoGraph complete in 3.02738618850708 seconds
Found communities [-1, ... 18], with sizes: [243, 1080, 376, 357, 254, 241, 187, 135, 111, 68, 46, 46, 39, 36, 36, 33, 27, 25, 25, 23]

In [188]:
sc.pp.normalize_per_cell(D354_Biop_Int2, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D354_Biop_Int2) # log transform the data
D354_Biop_Int2.raw = D354_Biop_Int2 # freeze the object (for later use of the raw state of it)
In [189]:
D354_Biop_Int2 = D354_Biop_Int2[:, D354_Biop_Int2.var['ribo_genes']]
D354_Biop_Int2
Out[189]:
View of AnnData object with n_obs × n_vars = 2711 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [190]:
D363_Biop_Int2 = sc.read_10x_mtx(
    './D363_Biop_Int2/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D363_Biop_Int2.var_names_make_unique()
D363_Biop_Int2.obs['manip'] = 'D363_Biop_Int2'
D363_Biop_Int2.obs['position'] = 'Intermediate'
D363_Biop_Int2.obs['method'] = 'Biopsy'
D363_Biop_Int2.obs['donor'] = 'D363'
D363_Biop_Int2.obs['name'] = ['D363_Biop_Int2_' + s for s in list(D363_Biop_Int2.obs.index)]
D363_Biop_Int2.obs_names = D363_Biop_Int2.obs['name']
D363_Biop_Int2
... reading from cache file ./cache/D363_Biop_Int2-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[190]:
AnnData object with n_obs × n_vars = 1290 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [191]:
sc.pl.highest_expr_genes(D363_Biop_Int2, n_top=20)
In [192]:
sc.pp.filter_cells(D363_Biop_Int2, min_genes=0)
mito_genes = D363_Biop_Int2.var_names.str.startswith('MT-')
D363_Biop_Int2.obs['percent_mito'] = np.sum(
    D363_Biop_Int2[:, mito_genes].X, axis=1).A1 / np.sum(D363_Biop_Int2.X, axis=1).A1
D363_Biop_Int2.obs['n_counts'] = D363_Biop_Int2.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D363_Biop_Int2.to_df())
ribo_genes = D363_Biop_Int2.to_df().columns.isin(RB_genes_in_df)
D363_Biop_Int2.obs['percent_ribo'] = np.sum(
    D363_Biop_Int2[:, ribo_genes].X, axis=1).A1 / np.sum(D363_Biop_Int2.X, axis=1).A1
D363_Biop_Int2.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D363_Biop_Int2, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [193]:
sc.pp.filter_cells(D363_Biop_Int2, min_genes=500)
D363_Biop_Int2 = D363_Biop_Int2[D363_Biop_Int2.obs['n_counts'] < 15000, :]
D363_Biop_Int2 = D363_Biop_Int2[D363_Biop_Int2.obs['percent_mito'] < 0.2, :]
filtered out 10 cells that have less than 500 genes expressed
In [194]:
# scrublet
scrub = scr.Scrublet(D363_Biop_Int2.X, expected_doublet_rate=0.011)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D363_Biop_Int2.obs['doublet_scores'] = doublet_scores
D363_Biop_Int2.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.11
Detected doublet rate = 0.7%
Estimated detectable doublet fraction = 23.5%
Overall doublet rate:
	Expected   = 1.1%
	Estimated  = 3.0%
Elapsed time: 0.7 seconds
Out[194]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea7e4dac8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea25aa278>],
       dtype=object))
In [195]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D363_Biop_Int2.X).predict()
D363_Biop_Int2.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11090898513793945 seconds
Jaccard graph constructed in 0.37721800804138184 seconds
Wrote graph to binary file in 0.019229888916015625 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.846293
Louvain completed 21 runs in 1.2453773021697998 seconds
PhenoGraph complete in 1.7667901515960693 seconds
Found communities [-1, ... 15], with sizes: [224, 394, 176, 167, 101, 96, 89, 67, 54, 50, 38, 33, 28, 26, 16, 15, 13]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2131812572479248 seconds
Jaccard graph constructed in 0.3695356845855713 seconds
Wrote graph to binary file in 0.019904375076293945 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.851624
Louvain completed 21 runs in 1.2541816234588623 seconds
PhenoGraph complete in 1.8761091232299805 seconds
Found communities [-1, ... 11], with sizes: [268, 444, 285, 120, 88, 78, 76, 61, 44, 42, 31, 30, 20]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21228909492492676 seconds
Jaccard graph constructed in 0.4420442581176758 seconds
Wrote graph to binary file in 0.018776893615722656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.848414
Louvain completed 21 runs in 1.2108337879180908 seconds
PhenoGraph complete in 1.894345760345459 seconds
Found communities [-1, ... 13], with sizes: [247, 426, 351, 86, 83, 77, 72, 51, 45, 37, 36, 31, 22, 12, 11]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21122074127197266 seconds
Jaccard graph constructed in 0.3942694664001465 seconds
Wrote graph to binary file in 0.02224588394165039 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.845911
Louvain completed 21 runs in 1.2422690391540527 seconds
PhenoGraph complete in 1.8826713562011719 seconds
Found communities [-1, ... 13], with sizes: [278, 447, 246, 135, 98, 72, 58, 45, 41, 41, 38, 29, 23, 18, 18]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20770907402038574 seconds
Jaccard graph constructed in 0.4313623905181885 seconds
Wrote graph to binary file in 0.019024133682250977 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.851339
Louvain completed 21 runs in 1.2186212539672852 seconds
PhenoGraph complete in 1.895219087600708 seconds
Found communities [-1, ... 11], with sizes: [245, 404, 235, 159, 123, 88, 78, 75, 48, 44, 35, 29, 24]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21138715744018555 seconds
Jaccard graph constructed in 0.3624267578125 seconds
Wrote graph to binary file in 0.018446683883666992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.846414
After 7 runs, maximum modularity is Q = 0.84756
Louvain completed 27 runs in 1.71342134475708 seconds
PhenoGraph complete in 2.3191380500793457 seconds
Found communities [-1, ... 14], with sizes: [206, 434, 265, 131, 99, 88, 73, 64, 50, 45, 29, 29, 27, 22, 13, 12]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2106611728668213 seconds
Jaccard graph constructed in 0.4227170944213867 seconds
Wrote graph to binary file in 0.26633191108703613 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.84827
After 2 runs, maximum modularity is Q = 0.849855
After 14 runs, maximum modularity is Q = 0.850876
Louvain completed 34 runs in 2.289544105529785 seconds
PhenoGraph complete in 3.2031667232513428 seconds
Found communities [-1, ... 13], with sizes: [234, 444, 259, 145, 83, 73, 69, 63, 48, 44, 38, 33, 21, 18, 15]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21259045600891113 seconds
Jaccard graph constructed in 0.45606374740600586 seconds
Wrote graph to binary file in 0.02292656898498535 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.848958
Louvain completed 21 runs in 1.2317864894866943 seconds
PhenoGraph complete in 1.9321236610412598 seconds
Found communities [-1, ... 11], with sizes: [229, 438, 256, 147, 110, 85, 83, 69, 44, 42, 34, 29, 21]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21307706832885742 seconds
Jaccard graph constructed in 0.47411417961120605 seconds
Wrote graph to binary file in 0.0263826847076416 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.847751
After 2 runs, maximum modularity is Q = 0.848798
Louvain completed 22 runs in 1.5304622650146484 seconds
PhenoGraph complete in 2.2559304237365723 seconds
Found communities [-1, ... 13], with sizes: [209, 408, 256, 125, 109, 86, 80, 72, 53, 47, 43, 34, 28, 24, 13]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21260690689086914 seconds
Jaccard graph constructed in 0.45042896270751953 seconds
Wrote graph to binary file in 0.022719144821166992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.854347
Louvain completed 21 runs in 1.2281692028045654 seconds
PhenoGraph complete in 1.9236557483673096 seconds
Found communities [-1, ... 12], with sizes: [232, 422, 278, 153, 94, 84, 75, 68, 45, 43, 31, 31, 20, 11]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21191763877868652 seconds
Jaccard graph constructed in 0.4443166255950928 seconds
Wrote graph to binary file in 0.02356243133544922 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.84917
Louvain completed 21 runs in 1.2576515674591064 seconds
PhenoGraph complete in 1.950350046157837 seconds
Found communities [-1, ... 14], with sizes: [218, 436, 260, 137, 98, 77, 69, 66, 44, 40, 38, 33, 21, 21, 18, 11]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21306586265563965 seconds
Jaccard graph constructed in 0.4184701442718506 seconds
Wrote graph to binary file in 0.02998185157775879 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.848609
Louvain completed 21 runs in 1.3128526210784912 seconds
PhenoGraph complete in 1.9944593906402588 seconds
Found communities [-1, ... 13], with sizes: [217, 425, 310, 104, 97, 75, 73, 68, 45, 41, 37, 34, 26, 21, 14]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21082472801208496 seconds
Jaccard graph constructed in 0.4482419490814209 seconds
Wrote graph to binary file in 0.022317171096801758 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.852943
After 2 runs, maximum modularity is Q = 0.85555
After 13 runs, maximum modularity is Q = 0.856678
Louvain completed 33 runs in 2.200714349746704 seconds
PhenoGraph complete in 2.8916006088256836 seconds
Found communities [-1, ... 14], with sizes: [203, 330, 266, 155, 122, 91, 83, 68, 65, 51, 47, 29, 26, 23, 16, 12]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20737504959106445 seconds
Jaccard graph constructed in 0.4564363956451416 seconds
Wrote graph to binary file in 0.025628089904785156 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.845279
Louvain completed 21 runs in 1.4998633861541748 seconds
PhenoGraph complete in 2.2094199657440186 seconds
Found communities [-1, ... 13], with sizes: [193, 404, 254, 171, 85, 81, 78, 78, 56, 42, 42, 38, 32, 20, 13]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21024131774902344 seconds
Jaccard graph constructed in 0.44397902488708496 seconds
Wrote graph to binary file in 0.023524999618530273 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.842858
After 2 runs, maximum modularity is Q = 0.844347
Louvain completed 22 runs in 1.4769408702850342 seconds
PhenoGraph complete in 2.1658098697662354 seconds
Found communities [-1, ... 12], with sizes: [234, 463, 234, 147, 90, 82, 79, 78, 45, 40, 30, 23, 22, 20]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21358656883239746 seconds
Jaccard graph constructed in 0.4476747512817383 seconds
Wrote graph to binary file in 0.247084379196167 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.846232
Louvain completed 21 runs in 1.250878095626831 seconds
PhenoGraph complete in 2.168264150619507 seconds
Found communities [-1, ... 12], with sizes: [248, 420, 333, 102, 100, 78, 67, 53, 46, 40, 35, 28, 25, 12]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2079613208770752 seconds
Jaccard graph constructed in 0.41254353523254395 seconds
Wrote graph to binary file in 0.02986741065979004 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.84982
Louvain completed 21 runs in 1.2374930381774902 seconds
PhenoGraph complete in 1.904066801071167 seconds
Found communities [-1, ... 15], with sizes: [192, 432, 268, 132, 101, 94, 85, 76, 47, 29, 25, 24, 23, 21, 14, 12, 12]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21439218521118164 seconds
Jaccard graph constructed in 0.4584996700286865 seconds
Wrote graph to binary file in 0.022737979888916016 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.850532
Louvain completed 21 runs in 1.2434029579162598 seconds
PhenoGraph complete in 1.948211908340454 seconds
Found communities [-1, ... 14], with sizes: [221, 416, 260, 124, 107, 91, 72, 57, 49, 44, 33, 28, 27, 24, 20, 14]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2076733112335205 seconds
Jaccard graph constructed in 0.45208048820495605 seconds
Wrote graph to binary file in 0.022835969924926758 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.85432
Louvain completed 21 runs in 1.2385921478271484 seconds
PhenoGraph complete in 1.9312427043914795 seconds
Found communities [-1, ... 14], with sizes: [229, 425, 269, 125, 97, 95, 74, 54, 49, 45, 38, 28, 21, 15, 12, 11]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21299219131469727 seconds
Jaccard graph constructed in 0.4493439197540283 seconds
Wrote graph to binary file in 0.024905681610107422 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.847756
After 16 runs, maximum modularity is Q = 0.848802
Louvain completed 36 runs in 2.1637072563171387 seconds
PhenoGraph complete in 2.8618645668029785 seconds
Found communities [-1, ... 14], with sizes: [252, 419, 230, 147, 94, 94, 73, 71, 46, 37, 31, 30, 23, 18, 11, 11]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21243715286254883 seconds
Jaccard graph constructed in 0.4398818016052246 seconds
Wrote graph to binary file in 0.022123098373413086 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.850926
Louvain completed 21 runs in 1.2408523559570312 seconds
PhenoGraph complete in 1.9251196384429932 seconds
Found communities [-1, ... 14], with sizes: [203, 420, 278, 121, 99, 78, 77, 63, 51, 48, 36, 33, 28, 25, 14, 13]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21517205238342285 seconds
Jaccard graph constructed in 0.48202037811279297 seconds
Wrote graph to binary file in 0.02966141700744629 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.845053
Louvain completed 21 runs in 1.3954365253448486 seconds
PhenoGraph complete in 2.1333401203155518 seconds
Found communities [-1, ... 12], with sizes: [185, 411, 262, 164, 97, 90, 89, 60, 52, 44, 41, 40, 29, 23]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21347546577453613 seconds
Jaccard graph constructed in 0.39672136306762695 seconds
Wrote graph to binary file in 0.028667449951171875 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.849292
Louvain completed 21 runs in 1.2357735633850098 seconds
PhenoGraph complete in 1.886756181716919 seconds
Found communities [-1, ... 16], with sizes: [219, 416, 262, 152, 102, 76, 63, 62, 44, 43, 28, 26, 25, 21, 13, 12, 12, 11]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21207165718078613 seconds
Jaccard graph constructed in 0.4134039878845215 seconds
Wrote graph to binary file in 0.0380253791809082 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.845069
After 2 runs, maximum modularity is Q = 0.846646
Louvain completed 22 runs in 1.5019104480743408 seconds
PhenoGraph complete in 2.177577257156372 seconds
Found communities [-1, ... 13], with sizes: [236, 422, 240, 181, 97, 77, 72, 61, 48, 39, 28, 28, 23, 22, 13]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2114720344543457 seconds
Jaccard graph constructed in 0.6479077339172363 seconds
Wrote graph to binary file in 0.032082557678222656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.850514
After 5 runs, maximum modularity is Q = 0.851661
Louvain completed 25 runs in 1.6499974727630615 seconds
PhenoGraph complete in 2.5567567348480225 seconds
Found communities [-1, ... 12], with sizes: [210, 408, 287, 184, 100, 84, 69, 60, 41, 39, 31, 30, 24, 20]

In [196]:
sc.pp.normalize_per_cell(D363_Biop_Int2, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D363_Biop_Int2) # log transform the data
D363_Biop_Int2.raw = D363_Biop_Int2 # freeze the object (for later use of the raw state of it)
In [197]:
D363_Biop_Int2 = D363_Biop_Int2[:, D363_Biop_Int2.var['ribo_genes']]
D363_Biop_Int2
Out[197]:
View of AnnData object with n_obs × n_vars = 1270 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [198]:
D367_Biop_Int1 = sc.read_10x_mtx(
    './D367_Biop_Int1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D367_Biop_Int1.var_names_make_unique()
D367_Biop_Int1.obs['manip'] = 'D367_Biop_Int1'
D367_Biop_Int1.obs['position'] = 'Intermediate'
D367_Biop_Int1.obs['method'] = 'Biopsy'
D367_Biop_Int1.obs['donor'] = 'D367'
D367_Biop_Int1.obs['name'] = ['D367_Biop_Int1_' + s for s in list(D367_Biop_Int1.obs.index)]
D367_Biop_Int1.obs_names = D367_Biop_Int1.obs['name']
D367_Biop_Int1
... reading from cache file ./cache/D367_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[198]:
AnnData object with n_obs × n_vars = 2310 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [199]:
sc.pl.highest_expr_genes(D367_Biop_Int1, n_top=20)
In [200]:
sc.pp.filter_cells(D367_Biop_Int1, min_genes=0)
mito_genes = D367_Biop_Int1.var_names.str.startswith('MT-')
D367_Biop_Int1.obs['percent_mito'] = np.sum(
    D367_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D367_Biop_Int1.X, axis=1).A1
D367_Biop_Int1.obs['n_counts'] = D367_Biop_Int1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D367_Biop_Int1.to_df())
ribo_genes = D367_Biop_Int1.to_df().columns.isin(RB_genes_in_df)
D367_Biop_Int1.obs['percent_ribo'] = np.sum(
    D367_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D367_Biop_Int1.X, axis=1).A1
D367_Biop_Int1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D367_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [201]:
sc.pp.filter_cells(D367_Biop_Int1, min_genes=500)
D367_Biop_Int1 = D367_Biop_Int1[D367_Biop_Int1.obs['n_counts'] < 20000, :]
D367_Biop_Int1 = D367_Biop_Int1[D367_Biop_Int1.obs['percent_mito'] < 0.1, :]
filtered out 12 cells that have less than 500 genes expressed
In [202]:
# scrublet
scrub = scr.Scrublet(D367_Biop_Int1.X, expected_doublet_rate=0.018)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D367_Biop_Int1.obs['doublet_scores'] = doublet_scores
D367_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.21
Detected doublet rate = 0.6%
Estimated detectable doublet fraction = 26.6%
Overall doublet rate:
	Expected   = 1.8%
	Estimated  = 2.3%
Elapsed time: 1.3 seconds
Out[202]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea865b828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea236df60>],
       dtype=object))
In [203]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D367_Biop_Int1.X).predict()
D367_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.514470100402832 seconds
Jaccard graph constructed in 0.5651981830596924 seconds
Wrote graph to binary file in 0.038552045822143555 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890353
Louvain completed 21 runs in 1.4390356540679932 seconds
PhenoGraph complete in 2.5758235454559326 seconds
Found communities [-1, ... 20], with sizes: [262, 591, 509, 257, 243, 166, 134, 130, 84, 68, 65, 62, 57, 50, 40, 30, 18, 18, 15, 13, 12, 11]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5079054832458496 seconds
Jaccard graph constructed in 0.5689420700073242 seconds
Wrote graph to binary file in 0.27757787704467773 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885757
Louvain completed 21 runs in 1.4813103675842285 seconds
PhenoGraph complete in 2.8528902530670166 seconds
Found communities [-1, ... 20], with sizes: [290, 586, 534, 247, 186, 131, 126, 126, 116, 73, 71, 68, 65, 50, 39, 36, 23, 17, 13, 13, 13, 12]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4097909927368164 seconds
Jaccard graph constructed in 0.5726964473724365 seconds
Wrote graph to binary file in 0.042246103286743164 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89165
Louvain completed 21 runs in 1.4535934925079346 seconds
PhenoGraph complete in 2.489902973175049 seconds
Found communities [-1, ... 20], with sizes: [352, 616, 444, 296, 172, 124, 124, 112, 96, 70, 69, 59, 58, 54, 50, 40, 23, 22, 18, 13, 12, 11]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41039419174194336 seconds
Jaccard graph constructed in 0.5943446159362793 seconds
Wrote graph to binary file in 0.04384565353393555 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891646
After 5 runs, maximum modularity is Q = 0.892677
Louvain completed 25 runs in 1.9274344444274902 seconds
PhenoGraph complete in 2.9952476024627686 seconds
Found communities [-1, ... 21], with sizes: [287, 596, 471, 282, 202, 137, 124, 110, 94, 85, 69, 63, 59, 52, 49, 39, 26, 22, 18, 15, 13, 11, 11]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41027235984802246 seconds
Jaccard graph constructed in 0.5733840465545654 seconds
Wrote graph to binary file in 0.042221784591674805 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.887762
Louvain completed 21 runs in 1.4715938568115234 seconds
PhenoGraph complete in 2.5102217197418213 seconds
Found communities [-1, ... 18], with sizes: [341, 617, 448, 282, 179, 155, 130, 126, 77, 71, 67, 62, 62, 60, 50, 39, 23, 17, 16, 13]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41303038597106934 seconds
Jaccard graph constructed in 0.5697891712188721 seconds
Wrote graph to binary file in 0.04419088363647461 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.887707
Louvain completed 21 runs in 1.4782352447509766 seconds
PhenoGraph complete in 2.519321918487549 seconds
Found communities [-1, ... 18], with sizes: [264, 602, 486, 285, 181, 147, 137, 134, 97, 72, 65, 64, 60, 57, 54, 49, 32, 22, 14, 13]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40918612480163574 seconds
Jaccard graph constructed in 0.8528876304626465 seconds
Wrote graph to binary file in 0.04215073585510254 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.889583
Louvain completed 21 runs in 1.443619966506958 seconds
PhenoGraph complete in 2.760995626449585 seconds
Found communities [-1, ... 19], with sizes: [301, 589, 473, 326, 176, 125, 124, 117, 84, 83, 73, 63, 58, 54, 48, 39, 33, 22, 21, 13, 13]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40827417373657227 seconds
Jaccard graph constructed in 0.5796041488647461 seconds
Wrote graph to binary file in 0.04244089126586914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886688
Louvain completed 21 runs in 1.4799628257751465 seconds
PhenoGraph complete in 2.5271430015563965 seconds
Found communities [-1, ... 17], with sizes: [296, 604, 489, 365, 187, 154, 122, 119, 118, 69, 59, 59, 49, 40, 37, 27, 16, 13, 12]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40851569175720215 seconds
Jaccard graph constructed in 0.5735282897949219 seconds
Wrote graph to binary file in 0.04412055015563965 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89127
After 12 runs, maximum modularity is Q = 0.892354
Louvain completed 32 runs in 2.340996503829956 seconds
PhenoGraph complete in 3.380171060562134 seconds
Found communities [-1, ... 19], with sizes: [332, 568, 516, 288, 194, 155, 151, 125, 69, 62, 61, 57, 56, 50, 39, 27, 26, 25, 12, 11, 11]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4080784320831299 seconds
Jaccard graph constructed in 0.5750257968902588 seconds
Wrote graph to binary file in 0.0429537296295166 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891469
Louvain completed 21 runs in 1.4571173191070557 seconds
PhenoGraph complete in 2.498727560043335 seconds
Found communities [-1, ... 18], with sizes: [301, 631, 478, 297, 233, 150, 148, 101, 72, 68, 58, 56, 49, 49, 39, 31, 27, 22, 13, 12]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4076077938079834 seconds
Jaccard graph constructed in 0.7601118087768555 seconds
Wrote graph to binary file in 0.04154253005981445 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.883312
After 2 runs, maximum modularity is Q = 0.884909
After 7 runs, maximum modularity is Q = 0.886385
Louvain completed 27 runs in 2.2608542442321777 seconds
PhenoGraph complete in 3.4828410148620605 seconds
Found communities [-1, ... 17], with sizes: [292, 596, 531, 278, 214, 161, 136, 133, 127, 66, 61, 53, 53, 51, 24, 22, 13, 12, 12]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4104807376861572 seconds
Jaccard graph constructed in 0.5960385799407959 seconds
Wrote graph to binary file in 0.04874396324157715 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.887172
After 2 runs, maximum modularity is Q = 0.889383
Louvain completed 22 runs in 1.9729182720184326 seconds
PhenoGraph complete in 3.0427701473236084 seconds
Found communities [-1, ... 19], with sizes: [324, 622, 464, 323, 165, 154, 112, 110, 77, 69, 67, 57, 55, 48, 47, 39, 37, 25, 16, 13, 11]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41545677185058594 seconds
Jaccard graph constructed in 0.5627744197845459 seconds
Wrote graph to binary file in 0.04211997985839844 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89236
After 5 runs, maximum modularity is Q = 0.893767
Louvain completed 25 runs in 1.9204761981964111 seconds
PhenoGraph complete in 2.955893039703369 seconds
Found communities [-1, ... 19], with sizes: [244, 604, 480, 341, 163, 138, 136, 122, 119, 72, 64, 60, 54, 54, 47, 39, 29, 26, 16, 14, 13]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40871596336364746 seconds
Jaccard graph constructed in 0.5704357624053955 seconds
Wrote graph to binary file in 0.2707843780517578 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.889555
Louvain completed 21 runs in 1.4673383235931396 seconds
PhenoGraph complete in 2.73282790184021 seconds
Found communities [-1, ... 20], with sizes: [271, 604, 483, 322, 128, 120, 118, 110, 109, 86, 80, 77, 61, 55, 55, 39, 34, 25, 18, 16, 13, 11]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40848517417907715 seconds
Jaccard graph constructed in 0.5637214183807373 seconds
Wrote graph to binary file in 0.04071211814880371 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888875
Louvain completed 21 runs in 1.4741291999816895 seconds
PhenoGraph complete in 2.5003631114959717 seconds
Found communities [-1, ... 17], with sizes: [290, 596, 505, 318, 178, 172, 152, 129, 126, 75, 62, 61, 48, 41, 25, 18, 13, 13, 13]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4133882522583008 seconds
Jaccard graph constructed in 0.5659253597259521 seconds
Wrote graph to binary file in 0.04111146926879883 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891283
After 4 runs, maximum modularity is Q = 0.892318
After 9 runs, maximum modularity is Q = 0.893359
Louvain completed 29 runs in 2.357012987136841 seconds
PhenoGraph complete in 3.3884224891662598 seconds
Found communities [-1, ... 21], with sizes: [282, 616, 496, 263, 152, 131, 123, 121, 111, 84, 78, 66, 59, 51, 40, 39, 31, 22, 20, 13, 13, 13, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41085362434387207 seconds
Jaccard graph constructed in 0.5608417987823486 seconds
Wrote graph to binary file in 0.04010009765625 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886978
Louvain completed 21 runs in 1.4581246376037598 seconds
PhenoGraph complete in 2.4833898544311523 seconds
Found communities [-1, ... 21], with sizes: [330, 588, 503, 269, 137, 135, 129, 124, 110, 83, 79, 61, 54, 50, 40, 39, 24, 16, 15, 13, 13, 12, 11]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4110555648803711 seconds
Jaccard graph constructed in 0.5750141143798828 seconds
Wrote graph to binary file in 0.044264793395996094 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886068
After 2 runs, maximum modularity is Q = 0.88852
Louvain completed 22 runs in 1.7303948402404785 seconds
PhenoGraph complete in 2.7754876613616943 seconds
Found communities [-1, ... 19], with sizes: [281, 597, 479, 321, 177, 174, 128, 125, 77, 67, 64, 62, 61, 51, 48, 40, 23, 20, 15, 13, 12]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41065454483032227 seconds
Jaccard graph constructed in 0.5838079452514648 seconds
Wrote graph to binary file in 0.2743256092071533 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888173
Louvain completed 21 runs in 1.4572603702545166 seconds
PhenoGraph complete in 2.741391658782959 seconds
Found communities [-1, ... 17], with sizes: [309, 584, 477, 302, 212, 174, 139, 137, 123, 93, 57, 53, 47, 40, 28, 20, 14, 14, 12]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41254210472106934 seconds
Jaccard graph constructed in 0.5795567035675049 seconds
Wrote graph to binary file in 0.040463924407958984 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888213
Louvain completed 21 runs in 1.458282232284546 seconds
PhenoGraph complete in 2.509772777557373 seconds
Found communities [-1, ... 15], with sizes: [349, 621, 450, 285, 235, 166, 134, 128, 127, 67, 67, 62, 51, 47, 22, 13, 11]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4081566333770752 seconds
Jaccard graph constructed in 0.5740773677825928 seconds
Wrote graph to binary file in 0.039682626724243164 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893113
Louvain completed 21 runs in 1.4458818435668945 seconds
PhenoGraph complete in 2.4829251766204834 seconds
Found communities [-1, ... 20], with sizes: [302, 578, 463, 302, 206, 189, 134, 133, 70, 70, 67, 64, 61, 48, 38, 25, 18, 16, 14, 13, 12, 12]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4136834144592285 seconds
Jaccard graph constructed in 0.5601158142089844 seconds
Wrote graph to binary file in 0.041204214096069336 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888787
After 2 runs, maximum modularity is Q = 0.890064
Louvain completed 22 runs in 1.7298526763916016 seconds
PhenoGraph complete in 2.7611961364746094 seconds
Found communities [-1, ... 17], with sizes: [300, 607, 503, 279, 235, 173, 126, 125, 123, 62, 55, 54, 51, 50, 24, 22, 18, 15, 13]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40808629989624023 seconds
Jaccard graph constructed in 0.5153343677520752 seconds
Wrote graph to binary file in 0.28693652153015137 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891632
Louvain completed 21 runs in 1.4537806510925293 seconds
PhenoGraph complete in 2.691406011581421 seconds
Found communities [-1, ... 18], with sizes: [299, 606, 498, 333, 192, 125, 122, 113, 112, 73, 70, 59, 56, 48, 39, 25, 20, 17, 15, 13]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4140186309814453 seconds
Jaccard graph constructed in 0.5657174587249756 seconds
Wrote graph to binary file in 0.0407099723815918 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882774
Louvain completed 21 runs in 1.4694406986236572 seconds
PhenoGraph complete in 2.505589723587036 seconds
Found communities [-1, ... 17], with sizes: [234, 583, 528, 345, 172, 147, 138, 133, 123, 87, 65, 62, 49, 47, 40, 33, 25, 13, 11]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4104888439178467 seconds
Jaccard graph constructed in 0.5688419342041016 seconds
Wrote graph to binary file in 0.04162716865539551 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890672
Louvain completed 21 runs in 1.441237449645996 seconds
PhenoGraph complete in 2.476186513900757 seconds
Found communities [-1, ... 18], with sizes: [276, 669, 454, 320, 168, 136, 132, 126, 103, 92, 72, 57, 53, 46, 40, 23, 23, 18, 14, 13]

In [204]:
sc.pp.normalize_per_cell(D367_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D367_Biop_Int1) # log transform the data
D367_Biop_Int1.raw = D367_Biop_Int1 # freeze the object (for later use of the raw state of it)
In [205]:
D367_Biop_Int1 = D367_Biop_Int1[:, D367_Biop_Int1.var['ribo_genes']]
D367_Biop_Int1
Out[205]:
View of AnnData object with n_obs × n_vars = 2268 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [206]:
D372_Biop_Int1 = sc.read_10x_mtx(
    './D372_Biop_Int1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D372_Biop_Int1.var_names_make_unique()
D372_Biop_Int1.obs['manip'] = 'D372_Biop_Int1'
D372_Biop_Int1.obs['position'] = 'Intermediate'
D372_Biop_Int1.obs['method'] = 'Biopsy'
D372_Biop_Int1.obs['donor'] = 'D372'
D372_Biop_Int1.obs['name'] = ['D372_Biop_Int1_' + s for s in list(D372_Biop_Int1.obs.index)]
D372_Biop_Int1.obs_names = D372_Biop_Int1.obs['name']
D372_Biop_Int1
... reading from cache file ./cache/D372_Biop_Int1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[206]:
AnnData object with n_obs × n_vars = 1255 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [207]:
sc.pl.highest_expr_genes(D372_Biop_Int1, n_top=20)
In [208]:
sc.pp.filter_cells(D372_Biop_Int1, min_genes=0)
mito_genes = D372_Biop_Int1.var_names.str.startswith('MT-')
D372_Biop_Int1.obs['percent_mito'] = np.sum(
    D372_Biop_Int1[:, mito_genes].X, axis=1).A1 / np.sum(D372_Biop_Int1.X, axis=1).A1
D372_Biop_Int1.obs['n_counts'] = D372_Biop_Int1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Biop_Int1.to_df())
ribo_genes = D372_Biop_Int1.to_df().columns.isin(RB_genes_in_df)
D372_Biop_Int1.obs['percent_ribo'] = np.sum(
    D372_Biop_Int1[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Biop_Int1.X, axis=1).A1
D372_Biop_Int1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D372_Biop_Int1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [209]:
sc.pp.filter_cells(D372_Biop_Int1, min_genes=500)
D372_Biop_Int1 = D372_Biop_Int1[D372_Biop_Int1.obs['n_counts'] < 20000, :]
D372_Biop_Int1 = D372_Biop_Int1[D372_Biop_Int1.obs['percent_mito'] < 0.2, :]
filtered out 8 cells that have less than 500 genes expressed
In [210]:
# scrublet
scrub = scr.Scrublet(D372_Biop_Int1.X, expected_doublet_rate=0.011)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D372_Biop_Int1.obs['doublet_scores'] = doublet_scores
D372_Biop_Int1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.11
Detected doublet rate = 0.5%
Estimated detectable doublet fraction = 26.9%
Overall doublet rate:
	Expected   = 1.1%
	Estimated  = 1.8%
Elapsed time: 0.6 seconds
Out[210]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea1fab1d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea2940550>],
       dtype=object))
In [211]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D372_Biop_Int1.X).predict()
D372_Biop_Int1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11031770706176758 seconds
Jaccard graph constructed in 0.4518752098083496 seconds
Wrote graph to binary file in 0.02142500877380371 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.865176
After 14 runs, maximum modularity is Q = 0.866203
Louvain completed 34 runs in 2.141688346862793 seconds
PhenoGraph complete in 2.7349908351898193 seconds
Found communities [-1, ... 17], with sizes: [263, 385, 219, 84, 79, 65, 57, 53, 52, 48, 48, 37, 35, 34, 30, 26, 13, 11, 11]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21302247047424316 seconds
Jaccard graph constructed in 0.4364047050476074 seconds
Wrote graph to binary file in 0.30630922317504883 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869238
Louvain completed 21 runs in 1.304117202758789 seconds
PhenoGraph complete in 2.2735512256622314 seconds
Found communities [-1, ... 15], with sizes: [244, 377, 302, 87, 72, 69, 60, 59, 49, 46, 44, 39, 29, 25, 18, 17, 13]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2141258716583252 seconds
Jaccard graph constructed in 0.457120418548584 seconds
Wrote graph to binary file in 0.026486635208129883 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.858513
After 2 runs, maximum modularity is Q = 0.860467
After 3 runs, maximum modularity is Q = 0.861709
Louvain completed 23 runs in 1.8500421047210693 seconds
PhenoGraph complete in 2.556292772293091 seconds
Found communities [-1, ... 17], with sizes: [240, 359, 201, 139, 72, 72, 61, 61, 51, 51, 47, 37, 36, 34, 34, 15, 15, 13, 12]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2118527889251709 seconds
Jaccard graph constructed in 0.4980463981628418 seconds
Wrote graph to binary file in 0.03323674201965332 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.865995
Louvain completed 21 runs in 1.4247725009918213 seconds
PhenoGraph complete in 2.182901382446289 seconds
Found communities [-1, ... 13], with sizes: [194, 449, 240, 177, 77, 62, 50, 49, 48, 48, 41, 39, 32, 32, 12]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21134185791015625 seconds
Jaccard graph constructed in 0.49370455741882324 seconds
Wrote graph to binary file in 0.028309106826782227 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.863751
Louvain completed 21 runs in 1.3193938732147217 seconds
PhenoGraph complete in 2.0624914169311523 seconds
Found communities [-1, ... 12], with sizes: [210, 414, 321, 116, 94, 79, 58, 54, 50, 43, 34, 32, 28, 17]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21464848518371582 seconds
Jaccard graph constructed in 0.41809606552124023 seconds
Wrote graph to binary file in 0.03922891616821289 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.869476
After 5 runs, maximum modularity is Q = 0.871083
Louvain completed 25 runs in 1.7431824207305908 seconds
PhenoGraph complete in 2.4311368465423584 seconds
Found communities [-1, ... 15], with sizes: [242, 369, 229, 116, 84, 66, 65, 61, 56, 47, 44, 41, 38, 31, 27, 22, 12]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21194982528686523 seconds
Jaccard graph constructed in 0.47103381156921387 seconds
Wrote graph to binary file in 0.027423858642578125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.864778
After 2 runs, maximum modularity is Q = 0.86664
Louvain completed 22 runs in 1.5547842979431152 seconds
PhenoGraph complete in 2.2745959758758545 seconds
Found communities [-1, ... 14], with sizes: [216, 419, 188, 125, 115, 89, 79, 70, 44, 43, 35, 33, 32, 29, 21, 12]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2141737937927246 seconds
Jaccard graph constructed in 0.40814638137817383 seconds
Wrote graph to binary file in 0.04988431930541992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.855954
After 2 runs, maximum modularity is Q = 0.85784
After 3 runs, maximum modularity is Q = 0.858869
After 18 runs, maximum modularity is Q = 0.859884
Louvain completed 38 runs in 2.779226779937744 seconds
PhenoGraph complete in 3.462613821029663 seconds
Found communities [-1, ... 12], with sizes: [262, 342, 223, 162, 104, 83, 76, 76, 70, 39, 39, 35, 26, 13]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21236109733581543 seconds
Jaccard graph constructed in 0.460660457611084 seconds
Wrote graph to binary file in 0.026329755783081055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.864972
Louvain completed 21 runs in 1.2901618480682373 seconds
PhenoGraph complete in 1.9989025592803955 seconds
Found communities [-1, ... 14], with sizes: [243, 447, 347, 72, 69, 59, 48, 45, 43, 39, 34, 32, 29, 15, 14, 14]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2124948501586914 seconds
Jaccard graph constructed in 0.4669172763824463 seconds
Wrote graph to binary file in 0.026116132736206055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.87476
Louvain completed 21 runs in 1.2778146266937256 seconds
PhenoGraph complete in 1.9951555728912354 seconds
Found communities [-1, ... 15], with sizes: [223, 357, 323, 107, 91, 64, 57, 45, 42, 41, 41, 40, 30, 29, 26, 22, 12]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21051359176635742 seconds
Jaccard graph constructed in 0.4531991481781006 seconds
Wrote graph to binary file in 0.2542264461517334 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.866208
Louvain completed 21 runs in 1.2684473991394043 seconds
PhenoGraph complete in 2.1980559825897217 seconds
Found communities [-1, ... 17], with sizes: [188, 458, 218, 88, 80, 63, 53, 53, 47, 43, 42, 40, 36, 29, 29, 26, 24, 18, 15]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2104489803314209 seconds
Jaccard graph constructed in 0.4701554775238037 seconds
Wrote graph to binary file in 0.025290489196777344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.8742
After 2 runs, maximum modularity is Q = 0.875348
Louvain completed 22 runs in 1.5572395324707031 seconds
PhenoGraph complete in 2.274151563644409 seconds
Found communities [-1, ... 18], with sizes: [228, 329, 212, 129, 78, 75, 66, 62, 54, 47, 47, 41, 39, 34, 31, 26, 18, 12, 11, 11]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2123584747314453 seconds
Jaccard graph constructed in 0.4525775909423828 seconds
Wrote graph to binary file in 0.024283647537231445 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.875277
Louvain completed 21 runs in 1.306823492050171 seconds
PhenoGraph complete in 2.004891872406006 seconds
Found communities [-1, ... 15], with sizes: [261, 365, 208, 99, 87, 73, 65, 51, 49, 46, 45, 42, 42, 37, 35, 28, 17]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21095514297485352 seconds
Jaccard graph constructed in 0.45581841468811035 seconds
Wrote graph to binary file in 0.024862051010131836 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.866302
Louvain completed 21 runs in 1.2976984977722168 seconds
PhenoGraph complete in 2.0001683235168457 seconds
Found communities [-1, ... 16], with sizes: [244, 356, 217, 141, 101, 79, 68, 59, 48, 46, 41, 41, 35, 20, 18, 13, 12, 11]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2138512134552002 seconds
Jaccard graph constructed in 0.4093482494354248 seconds
Wrote graph to binary file in 0.052251577377319336 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.862088
Louvain completed 21 runs in 1.2953226566314697 seconds
PhenoGraph complete in 1.9828617572784424 seconds
Found communities [-1, ... 17], with sizes: [222, 409, 203, 169, 77, 73, 61, 54, 50, 46, 34, 31, 30, 20, 17, 17, 13, 12, 12]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2129678726196289 seconds
Jaccard graph constructed in 0.46299242973327637 seconds
Wrote graph to binary file in 0.023497343063354492 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.861761
Louvain completed 21 runs in 1.2902758121490479 seconds
PhenoGraph complete in 1.9996886253356934 seconds
Found communities [-1, ... 16], with sizes: [226, 346, 202, 164, 108, 84, 63, 62, 47, 43, 42, 41, 33, 28, 18, 18, 13, 12]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21132779121398926 seconds
Jaccard graph constructed in 0.4584939479827881 seconds
Wrote graph to binary file in 0.024595975875854492 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.861196
After 5 runs, maximum modularity is Q = 0.86278
Louvain completed 25 runs in 1.7095210552215576 seconds
PhenoGraph complete in 2.4141924381256104 seconds
Found communities [-1, ... 15], with sizes: [221, 407, 211, 105, 102, 82, 76, 67, 51, 44, 41, 41, 34, 26, 19, 12, 11]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21327757835388184 seconds
Jaccard graph constructed in 0.4623739719390869 seconds
Wrote graph to binary file in 0.02399420738220215 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.861037
Louvain completed 21 runs in 1.2983441352844238 seconds
PhenoGraph complete in 2.0079009532928467 seconds
Found communities [-1, ... 15], with sizes: [243, 325, 229, 117, 87, 87, 83, 60, 58, 50, 41, 38, 37, 36, 22, 22, 15]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21152997016906738 seconds
Jaccard graph constructed in 0.46665406227111816 seconds
Wrote graph to binary file in 0.02480936050415039 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.863013
Louvain completed 21 runs in 1.2821424007415771 seconds
PhenoGraph complete in 1.995488166809082 seconds
Found communities [-1, ... 14], with sizes: [257, 349, 198, 111, 103, 94, 91, 81, 73, 36, 34, 32, 32, 32, 15, 12]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21063899993896484 seconds
Jaccard graph constructed in 0.45795249938964844 seconds
Wrote graph to binary file in 0.02392888069152832 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.860467
Louvain completed 21 runs in 1.2803103923797607 seconds
PhenoGraph complete in 1.9826774597167969 seconds
Found communities [-1, ... 16], with sizes: [229, 344, 224, 130, 93, 88, 87, 57, 49, 41, 38, 37, 35, 29, 29, 14, 13, 13]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21447324752807617 seconds
Jaccard graph constructed in 0.6688611507415771 seconds
Wrote graph to binary file in 0.03195953369140625 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.864132
After 2 runs, maximum modularity is Q = 0.865931
Louvain completed 22 runs in 1.5954487323760986 seconds
PhenoGraph complete in 2.5220508575439453 seconds
Found communities [-1, ... 14], with sizes: [231, 392, 183, 96, 93, 88, 88, 78, 77, 67, 41, 38, 29, 23, 14, 12]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21268606185913086 seconds
Jaccard graph constructed in 0.4194824695587158 seconds
Wrote graph to binary file in 0.029398441314697266 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.858512
After 2 runs, maximum modularity is Q = 0.859882
Louvain completed 22 runs in 1.5828559398651123 seconds
PhenoGraph complete in 2.2718589305877686 seconds
Found communities [-1, ... 15], with sizes: [262, 332, 213, 137, 120, 92, 66, 58, 50, 50, 37, 33, 32, 28, 14, 13, 13]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2121415138244629 seconds
Jaccard graph constructed in 0.4728724956512451 seconds
Wrote graph to binary file in 0.025281906127929688 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.85999
After 2 runs, maximum modularity is Q = 0.861145
Louvain completed 22 runs in 1.7052440643310547 seconds
PhenoGraph complete in 2.427703619003296 seconds
Found communities [-1, ... 15], with sizes: [240, 341, 211, 129, 128, 81, 78, 56, 53, 45, 43, 38, 33, 28, 19, 16, 11]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2129504680633545 seconds
Jaccard graph constructed in 0.4776742458343506 seconds
Wrote graph to binary file in 0.023955106735229492 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.863159
Louvain completed 21 runs in 1.314345359802246 seconds
PhenoGraph complete in 2.0375468730926514 seconds
Found communities [-1, ... 15], with sizes: [251, 350, 228, 115, 113, 72, 65, 63, 46, 45, 42, 41, 40, 33, 17, 15, 14]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2104799747467041 seconds
Jaccard graph constructed in 0.4557795524597168 seconds
Wrote graph to binary file in 0.023629426956176758 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.862927
Louvain completed 21 runs in 1.3198494911193848 seconds
PhenoGraph complete in 2.019496202468872 seconds
Found communities [-1, ... 14], with sizes: [267, 446, 210, 98, 83, 80, 62, 56, 48, 44, 42, 34, 31, 22, 14, 13]

In [212]:
sc.pp.normalize_per_cell(D372_Biop_Int1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D372_Biop_Int1) # log transform the data
D372_Biop_Int1.raw = D372_Biop_Int1 # freeze the object (for later use of the raw state of it)
In [213]:
D372_Biop_Int1 = D372_Biop_Int1[:, D372_Biop_Int1.var['ribo_genes']]
D372_Biop_Int1
Out[213]:
View of AnnData object with n_obs × n_vars = 1240 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [214]:
D372_Biop_Int2 = sc.read_10x_mtx(
    './D372_Biop_Int2/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D372_Biop_Int2.var_names_make_unique()
D372_Biop_Int2.obs['manip'] = 'D372_Biop_Int2'
D372_Biop_Int2.obs['position'] = 'Intermediate'
D372_Biop_Int2.obs['method'] = 'Biopsy'
D372_Biop_Int2.obs['donor'] = 'D372'
D372_Biop_Int2.obs['name'] = ['D372_Biop_Int2_' + s for s in list(D372_Biop_Int2.obs.index)]
D372_Biop_Int2.obs_names = D372_Biop_Int2.obs['name']
D372_Biop_Int2
... reading from cache file ./cache/D372_Biop_Int2-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[214]:
AnnData object with n_obs × n_vars = 4003 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [215]:
sc.pl.highest_expr_genes(D372_Biop_Int2, n_top=20)
In [216]:
sc.pp.filter_cells(D372_Biop_Int2, min_genes=0)
mito_genes = D372_Biop_Int2.var_names.str.startswith('MT-')
D372_Biop_Int2.obs['percent_mito'] = np.sum(
    D372_Biop_Int2[:, mito_genes].X, axis=1).A1 / np.sum(D372_Biop_Int2.X, axis=1).A1
D372_Biop_Int2.obs['n_counts'] = D372_Biop_Int2.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Biop_Int2.to_df())
ribo_genes = D372_Biop_Int2.to_df().columns.isin(RB_genes_in_df)
D372_Biop_Int2.obs['percent_ribo'] = np.sum(
    D372_Biop_Int2[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Biop_Int2.X, axis=1).A1
D372_Biop_Int2.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D372_Biop_Int2, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [217]:
sc.pp.filter_cells(D372_Biop_Int2, min_genes=500)
D372_Biop_Int2 = D372_Biop_Int2[D372_Biop_Int2.obs['n_counts'] < 20000, :]
D372_Biop_Int2 = D372_Biop_Int2[D372_Biop_Int2.obs['percent_mito'] < 0.2, :]
filtered out 25 cells that have less than 500 genes expressed
In [218]:
# scrublet
scrub = scr.Scrublet(D372_Biop_Int2.X, expected_doublet_rate=0.031)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D372_Biop_Int2.obs['doublet_scores'] = doublet_scores
D372_Biop_Int2.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.25
Detected doublet rate = 1.3%
Estimated detectable doublet fraction = 27.9%
Overall doublet rate:
	Expected   = 3.1%
	Estimated  = 4.8%
Elapsed time: 2.9 seconds
Out[218]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ecbb42898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea1901a20>],
       dtype=object))
In [219]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D372_Biop_Int2.X).predict()
D372_Biop_Int2.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8117721080780029 seconds
Jaccard graph constructed in 1.0435481071472168 seconds
Wrote graph to binary file in 0.08883523941040039 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.935617
After 11 runs, maximum modularity is Q = 0.93678
Louvain completed 31 runs in 2.9441475868225098 seconds
PhenoGraph complete in 4.906459808349609 seconds
Found communities [-1, ... 29], with sizes: [168, 849, 797, 383, 343, 269, 268, 243, 240, 215, 120, 113, 98, 91, 79, 78, 74, 62, 61, 55, 50, 49, 44, 36, 35, 34, 32, 17, 14, 13, 13]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8170309066772461 seconds
Jaccard graph constructed in 0.7652695178985596 seconds
Wrote graph to binary file in 0.3165557384490967 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.934101
Louvain completed 21 runs in 1.9748854637145996 seconds
PhenoGraph complete in 3.891366481781006 seconds
Found communities [-1, ... 28], with sizes: [174, 1031, 671, 398, 309, 265, 251, 209, 191, 157, 142, 137, 109, 91, 87, 83, 80, 79, 62, 60, 59, 59, 52, 42, 38, 36, 30, 17, 13, 11]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8148114681243896 seconds
Jaccard graph constructed in 0.7761123180389404 seconds
Wrote graph to binary file in 0.0947563648223877 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.934828
After 14 runs, maximum modularity is Q = 0.936188
Louvain completed 34 runs in 3.187971830368042 seconds
PhenoGraph complete in 4.892239332199097 seconds
Found communities [-1, ... 30], with sizes: [165, 913, 866, 449, 286, 231, 228, 225, 165, 134, 120, 120, 117, 89, 81, 74, 71, 65, 64, 60, 57, 55, 54, 47, 38, 34, 32, 30, 29, 18, 14, 12]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.2125005722045898 seconds
Jaccard graph constructed in 0.7652847766876221 seconds
Wrote graph to binary file in 0.0940396785736084 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.935108
Louvain completed 21 runs in 1.900214433670044 seconds
PhenoGraph complete in 3.9911155700683594 seconds
Found communities [-1, ... 27], with sizes: [161, 1628, 342, 311, 287, 259, 226, 211, 195, 193, 170, 116, 89, 85, 77, 72, 65, 60, 57, 57, 56, 50, 40, 32, 31, 31, 17, 14, 11]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.221062421798706 seconds
Jaccard graph constructed in 1.015894889831543 seconds
Wrote graph to binary file in 0.09170222282409668 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.934143
Louvain completed 21 runs in 1.893204689025879 seconds
PhenoGraph complete in 4.241094350814819 seconds
Found communities [-1, ... 28], with sizes: [143, 905, 868, 467, 337, 234, 230, 220, 165, 126, 116, 108, 99, 96, 89, 78, 75, 69, 65, 65, 61, 61, 50, 46, 45, 39, 31, 27, 17, 11]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.213078498840332 seconds
Jaccard graph constructed in 0.7669408321380615 seconds
Wrote graph to binary file in 0.09694147109985352 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.933835
Louvain completed 21 runs in 2.009591817855835 seconds
PhenoGraph complete in 4.103895425796509 seconds
Found communities [-1, ... 28], with sizes: [167, 914, 814, 468, 353, 263, 254, 198, 197, 167, 121, 114, 111, 87, 86, 86, 69, 63, 61, 52, 44, 43, 40, 32, 31, 29, 28, 25, 15, 11]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9172782897949219 seconds
Jaccard graph constructed in 0.9897208213806152 seconds
Wrote graph to binary file in 0.09256529808044434 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.93334
Louvain completed 21 runs in 1.8727903366088867 seconds
PhenoGraph complete in 3.887312889099121 seconds
Found communities [-1, ... 28], with sizes: [132, 1632, 325, 279, 259, 258, 255, 221, 217, 206, 120, 108, 89, 86, 78, 77, 67, 63, 61, 58, 51, 51, 43, 35, 33, 33, 31, 29, 29, 17]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8165619373321533 seconds
Jaccard graph constructed in 0.7624201774597168 seconds
Wrote graph to binary file in 0.30907201766967773 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.935205
After 10 runs, maximum modularity is Q = 0.936348
Louvain completed 30 runs in 2.844038724899292 seconds
PhenoGraph complete in 4.750696659088135 seconds
Found communities [-1, ... 28], with sizes: [145, 876, 780, 453, 365, 285, 249, 247, 198, 140, 122, 112, 108, 90, 88, 87, 79, 78, 63, 52, 52, 50, 46, 35, 35, 33, 31, 17, 16, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9124925136566162 seconds
Jaccard graph constructed in 0.7728259563446045 seconds
Wrote graph to binary file in 0.09240317344665527 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.933658
Louvain completed 21 runs in 1.9913523197174072 seconds
PhenoGraph complete in 3.7952184677124023 seconds
Found communities [-1, ... 26], with sizes: [158, 864, 857, 491, 461, 383, 215, 191, 133, 114, 114, 104, 88, 78, 77, 75, 69, 63, 60, 60, 54, 54, 46, 39, 36, 29, 19, 11]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.012800931930542 seconds
Jaccard graph constructed in 0.7807364463806152 seconds
Wrote graph to binary file in 0.35766029357910156 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.938331
Louvain completed 21 runs in 1.9387297630310059 seconds
PhenoGraph complete in 4.108801603317261 seconds
Found communities [-1, ... 28], with sizes: [199, 946, 825, 429, 320, 289, 248, 215, 163, 122, 109, 105, 101, 88, 81, 78, 75, 71, 61, 59, 56, 52, 50, 45, 42, 36, 33, 20, 13, 12]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9129884243011475 seconds
Jaccard graph constructed in 0.7538411617279053 seconds
Wrote graph to binary file in 0.09158706665039062 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.935929
Louvain completed 21 runs in 1.9764914512634277 seconds
PhenoGraph complete in 3.7522823810577393 seconds
Found communities [-1, ... 30], with sizes: [145, 886, 877, 342, 284, 252, 242, 221, 199, 195, 125, 115, 103, 99, 89, 77, 76, 74, 65, 61, 57, 56, 51, 50, 40, 37, 32, 31, 18, 17, 14, 13]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.821117639541626 seconds
Jaccard graph constructed in 0.7486910820007324 seconds
Wrote graph to binary file in 0.3156290054321289 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.935243
Louvain completed 21 runs in 1.900001049041748 seconds
PhenoGraph complete in 3.801069974899292 seconds
Found communities [-1, ... 28], with sizes: [140, 875, 864, 348, 278, 277, 249, 241, 235, 206, 136, 123, 121, 89, 77, 76, 69, 61, 59, 58, 57, 56, 50, 45, 37, 32, 30, 20, 18, 16]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.027388334274292 seconds
Jaccard graph constructed in 0.7994790077209473 seconds
Wrote graph to binary file in 0.10189247131347656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.933372
After 15 runs, maximum modularity is Q = 0.934527
Louvain completed 35 runs in 3.5184009075164795 seconds
PhenoGraph complete in 5.469161033630371 seconds
Found communities [-1, ... 28], with sizes: [175, 899, 864, 434, 327, 247, 240, 236, 196, 124, 124, 107, 95, 93, 84, 80, 75, 64, 61, 59, 59, 55, 52, 47, 34, 32, 32, 19, 17, 12]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0228221416473389 seconds
Jaccard graph constructed in 0.7615830898284912 seconds
Wrote graph to binary file in 0.09098577499389648 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.933012
After 2 runs, maximum modularity is Q = 0.934076
Louvain completed 22 runs in 2.399656057357788 seconds
PhenoGraph complete in 4.292960166931152 seconds
Found communities [-1, ... 27], with sizes: [149, 947, 790, 474, 448, 286, 261, 229, 125, 118, 118, 112, 96, 88, 78, 76, 67, 61, 60, 59, 57, 49, 40, 32, 31, 29, 27, 25, 11]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8194582462310791 seconds
Jaccard graph constructed in 1.0443758964538574 seconds
Wrote graph to binary file in 0.09239006042480469 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.93494
Louvain completed 21 runs in 1.9768857955932617 seconds
PhenoGraph complete in 3.9492082595825195 seconds
Found communities [-1, ... 28], with sizes: [178, 941, 776, 464, 442, 363, 240, 228, 116, 116, 99, 87, 84, 84, 83, 80, 78, 61, 53, 53, 52, 46, 38, 36, 35, 32, 30, 25, 12, 11]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0186831951141357 seconds
Jaccard graph constructed in 0.7586953639984131 seconds
Wrote graph to binary file in 0.0899960994720459 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.936074
After 7 runs, maximum modularity is Q = 0.937143
Louvain completed 27 runs in 2.5816171169281006 seconds
PhenoGraph complete in 4.467833757400513 seconds
Found communities [-1, ... 28], with sizes: [182, 935, 786, 422, 322, 266, 242, 235, 229, 128, 126, 115, 89, 85, 85, 83, 75, 72, 62, 61, 57, 56, 46, 38, 37, 34, 33, 17, 14, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9122560024261475 seconds
Jaccard graph constructed in 1.0107040405273438 seconds
Wrote graph to binary file in 0.0894930362701416 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.93546
Louvain completed 21 runs in 1.9582693576812744 seconds
PhenoGraph complete in 3.987028121948242 seconds
Found communities [-1, ... 31], with sizes: [182, 856, 850, 482, 374, 302, 299, 226, 115, 107, 106, 90, 87, 83, 82, 79, 77, 63, 59, 51, 45, 45, 41, 36, 34, 32, 28, 28, 24, 19, 16, 13, 12]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8124711513519287 seconds
Jaccard graph constructed in 0.7607028484344482 seconds
Wrote graph to binary file in 0.3139994144439697 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.933227
Louvain completed 21 runs in 1.9329650402069092 seconds
PhenoGraph complete in 3.838634490966797 seconds
Found communities [-1, ... 27], with sizes: [150, 932, 844, 357, 279, 266, 251, 232, 225, 202, 194, 121, 103, 91, 81, 78, 66, 61, 56, 53, 48, 42, 41, 36, 34, 33, 25, 22, 20]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0123755931854248 seconds
Jaccard graph constructed in 0.7377088069915771 seconds
Wrote graph to binary file in 0.09003496170043945 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.93549
Louvain completed 21 runs in 1.9168570041656494 seconds
PhenoGraph complete in 3.7733287811279297 seconds
Found communities [-1, ... 28], with sizes: [178, 878, 847, 433, 341, 296, 254, 239, 161, 136, 121, 119, 105, 90, 79, 76, 67, 65, 61, 56, 53, 52, 47, 46, 35, 32, 31, 17, 16, 12]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.81166672706604 seconds
Jaccard graph constructed in 0.7489631175994873 seconds
Wrote graph to binary file in 0.0907747745513916 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.934919
After 5 runs, maximum modularity is Q = 0.936508
Louvain completed 25 runs in 2.493182897567749 seconds
PhenoGraph complete in 4.162370681762695 seconds
Found communities [-1, ... 27], with sizes: [162, 933, 818, 431, 379, 329, 256, 243, 183, 129, 120, 102, 91, 90, 80, 78, 75, 61, 51, 43, 39, 39, 38, 35, 33, 31, 30, 25, 19]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9126756191253662 seconds
Jaccard graph constructed in 0.9736266136169434 seconds
Wrote graph to binary file in 0.09059786796569824 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.933951
After 3 runs, maximum modularity is Q = 0.935124
Louvain completed 23 runs in 2.36483097076416 seconds
PhenoGraph complete in 4.35871696472168 seconds
Found communities [-1, ... 27], with sizes: [169, 947, 786, 353, 288, 269, 262, 247, 225, 210, 122, 114, 104, 89, 88, 81, 80, 74, 60, 54, 54, 46, 46, 41, 37, 36, 33, 17, 11]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.011352777481079 seconds
Jaccard graph constructed in 0.7464473247528076 seconds
Wrote graph to binary file in 0.3128812313079834 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.936431
Louvain completed 21 runs in 1.9111721515655518 seconds
PhenoGraph complete in 4.000088691711426 seconds
Found communities [-1, ... 26], with sizes: [167, 913, 832, 460, 455, 312, 252, 237, 220, 113, 110, 97, 87, 77, 77, 74, 60, 55, 54, 49, 47, 37, 36, 33, 32, 27, 17, 13]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8121376037597656 seconds
Jaccard graph constructed in 0.7482364177703857 seconds
Wrote graph to binary file in 0.08965182304382324 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.934755
Louvain completed 21 runs in 1.8797099590301514 seconds
PhenoGraph complete in 3.5465455055236816 seconds
Found communities [-1, ... 29], with sizes: [173, 891, 853, 331, 279, 265, 264, 257, 237, 216, 138, 116, 109, 92, 79, 74, 71, 64, 61, 54, 49, 42, 34, 33, 31, 30, 30, 27, 19, 13, 11]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.912355899810791 seconds
Jaccard graph constructed in 0.7591440677642822 seconds
Wrote graph to binary file in 0.09034132957458496 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.935543
Louvain completed 21 runs in 1.9075205326080322 seconds
PhenoGraph complete in 3.6860432624816895 seconds
Found communities [-1, ... 28], with sizes: [125, 909, 860, 343, 296, 259, 242, 232, 230, 222, 125, 113, 111, 90, 85, 82, 78, 70, 61, 59, 57, 55, 44, 41, 41, 39, 31, 18, 14, 11]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0124380588531494 seconds
Jaccard graph constructed in 0.7433838844299316 seconds
Wrote graph to binary file in 0.3591461181640625 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.934186
Louvain completed 21 runs in 1.9338464736938477 seconds
PhenoGraph complete in 4.067973852157593 seconds
Found communities [-1, ... 27], with sizes: [146, 901, 852, 446, 302, 245, 244, 215, 191, 131, 128, 127, 112, 107, 93, 81, 80, 74, 64, 63, 62, 55, 48, 41, 38, 36, 31, 19, 11]

In [220]:
sc.pp.normalize_per_cell(D372_Biop_Int2, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D372_Biop_Int2) # log transform the data
D372_Biop_Int2.raw = D372_Biop_Int2 # freeze the object (for later use of the raw state of it)
In [221]:
D372_Biop_Int2 = D372_Biop_Int2[:, D372_Biop_Int2.var['ribo_genes']]
D372_Biop_Int2
Out[221]:
View of AnnData object with n_obs × n_vars = 3955 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

Distal Brushings

Back to top

In [222]:
D326_Brus_Dis1 = sc.read_10x_mtx(
    './D326_Brus_Dis1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D326_Brus_Dis1.var_names_make_unique()
D326_Brus_Dis1.obs['manip'] = 'D326_Brus_Dis1'
D326_Brus_Dis1.obs['position'] = 'Distal'
D326_Brus_Dis1.obs['method'] = 'Brushing'
D326_Brus_Dis1.obs['donor'] = 'D326'
D326_Brus_Dis1.obs['name'] = ['D326_Brus_Dis1_' + s for s in list(D326_Brus_Dis1.obs.index)]
D326_Brus_Dis1.obs_names = D326_Brus_Dis1.obs['name']
D326_Brus_Dis1
... reading from cache file ./cache/D326_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[222]:
AnnData object with n_obs × n_vars = 1250 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [223]:
sc.pl.highest_expr_genes(D326_Brus_Dis1, n_top=20)
In [224]:
sc.pp.filter_cells(D326_Brus_Dis1, min_genes=0)
mito_genes = D326_Brus_Dis1.var_names.str.startswith('MT-')
D326_Brus_Dis1.obs['percent_mito'] = np.sum(
    D326_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D326_Brus_Dis1.X, axis=1).A1
D326_Brus_Dis1.obs['n_counts'] = D326_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D326_Brus_Dis1.to_df())
ribo_genes = D326_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D326_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D326_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D326_Brus_Dis1.X, axis=1).A1
D326_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D326_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [225]:
sc.pp.filter_cells(D326_Brus_Dis1, min_genes=500)
D326_Brus_Dis1 = D326_Brus_Dis1[D326_Brus_Dis1.obs['n_counts'] < 15000, :]
D326_Brus_Dis1 = D326_Brus_Dis1[D326_Brus_Dis1.obs['percent_mito'] < 0.25 , :]
filtered out 126 cells that have less than 500 genes expressed
In [226]:
# scrublet
scrub = scr.Scrublet(D326_Brus_Dis1.X, expected_doublet_rate=0.01)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D326_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D326_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.09
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 32.9%
Overall doublet rate:
	Expected   = 1.0%
	Estimated  = 1.4%
Elapsed time: 0.6 seconds
Out[226]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea7e52160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea08d2d68>],
       dtype=object))
In [227]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D326_Brus_Dis1.X).predict()
D326_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21236729621887207 seconds
Jaccard graph constructed in 0.3849825859069824 seconds
Wrote graph to binary file in 0.026634931564331055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885995
After 2 runs, maximum modularity is Q = 0.887532
Louvain completed 22 runs in 1.5970864295959473 seconds
PhenoGraph complete in 2.232036828994751 seconds
Found communities [-1, ... 15], with sizes: [175, 276, 195, 102, 99, 82, 76, 63, 57, 49, 48, 40, 31, 30, 25, 22, 22]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21405839920043945 seconds
Jaccard graph constructed in 0.45954298973083496 seconds
Wrote graph to binary file in 0.02256298065185547 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882894
Louvain completed 21 runs in 1.3038721084594727 seconds
PhenoGraph complete in 2.0111072063446045 seconds
Found communities [-1, ... 17], with sizes: [160, 222, 196, 95, 82, 81, 74, 66, 61, 59, 56, 50, 35, 33, 31, 29, 23, 22, 17]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21121788024902344 seconds
Jaccard graph constructed in 0.38576459884643555 seconds
Wrote graph to binary file in 0.329129695892334 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.879739
Louvain completed 21 runs in 1.3448681831359863 seconds
PhenoGraph complete in 2.2883236408233643 seconds
Found communities [-1, ... 16], with sizes: [160, 283, 200, 118, 108, 100, 69, 61, 46, 41, 36, 31, 29, 25, 24, 23, 21, 17]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2129228115081787 seconds
Jaccard graph constructed in 0.4608762264251709 seconds
Wrote graph to binary file in 0.026796817779541016 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882111
After 2 runs, maximum modularity is Q = 0.883529
After 3 runs, maximum modularity is Q = 0.884916
Louvain completed 23 runs in 1.8575758934020996 seconds
PhenoGraph complete in 2.567722797393799 seconds
Found communities [-1, ... 17], with sizes: [142, 267, 165, 116, 92, 88, 75, 71, 63, 61, 60, 56, 26, 26, 26, 15, 15, 15, 13]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10753631591796875 seconds
Jaccard graph constructed in 0.41381263732910156 seconds
Wrote graph to binary file in 0.04300236701965332 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885981
After 2 runs, maximum modularity is Q = 0.887036
Louvain completed 22 runs in 1.5978055000305176 seconds
PhenoGraph complete in 2.1889121532440186 seconds
Found communities [-1, ... 17], with sizes: [147, 278, 164, 110, 88, 88, 69, 59, 59, 55, 46, 41, 38, 32, 32, 27, 25, 21, 13]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11262702941894531 seconds
Jaccard graph constructed in 0.4353642463684082 seconds
Wrote graph to binary file in 0.06667590141296387 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.883925
Louvain completed 21 runs in 1.3180835247039795 seconds
PhenoGraph complete in 1.943319320678711 seconds
Found communities [-1, ... 17], with sizes: [150, 259, 187, 121, 91, 87, 86, 62, 58, 46, 45, 35, 33, 27, 24, 23, 20, 19, 19]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11774826049804688 seconds
Jaccard graph constructed in 0.4601624011993408 seconds
Wrote graph to binary file in 0.029280662536621094 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89027
Louvain completed 21 runs in 1.2568957805633545 seconds
PhenoGraph complete in 1.873772144317627 seconds
Found communities [-1, ... 19], with sizes: [164, 242, 191, 87, 83, 83, 71, 66, 61, 55, 52, 48, 36, 29, 24, 22, 20, 18, 15, 14, 11]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10627269744873047 seconds
Jaccard graph constructed in 0.48020219802856445 seconds
Wrote graph to binary file in 0.030440092086791992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890978
Louvain completed 21 runs in 1.3140740394592285 seconds
PhenoGraph complete in 1.944899082183838 seconds
Found communities [-1, ... 18], with sizes: [155, 256, 228, 90, 86, 78, 72, 67, 60, 52, 48, 39, 32, 29, 21, 20, 18, 16, 14, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21527576446533203 seconds
Jaccard graph constructed in 0.40128302574157715 seconds
Wrote graph to binary file in 0.04050946235656738 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885462
Louvain completed 21 runs in 1.3291218280792236 seconds
PhenoGraph complete in 2.0129151344299316 seconds
Found communities [-1, ... 19], with sizes: [150, 246, 200, 95, 82, 74, 70, 60, 58, 57, 50, 41, 35, 32, 30, 28, 21, 21, 15, 14, 13]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11005902290344238 seconds
Jaccard graph constructed in 0.482438325881958 seconds
Wrote graph to binary file in 0.029459238052368164 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885172
Louvain completed 21 runs in 1.3148517608642578 seconds
PhenoGraph complete in 1.9467573165893555 seconds
Found communities [-1, ... 17], with sizes: [174, 274, 194, 92, 76, 75, 72, 66, 64, 59, 57, 41, 31, 27, 25, 21, 15, 15, 14]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2090897560119629 seconds
Jaccard graph constructed in 0.4054684638977051 seconds
Wrote graph to binary file in 0.05972576141357422 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.884556
Louvain completed 21 runs in 1.3259060382843018 seconds
PhenoGraph complete in 2.0101284980773926 seconds
Found communities [-1, ... 19], with sizes: [157, 268, 196, 98, 76, 65, 65, 56, 56, 48, 43, 35, 34, 33, 31, 30, 29, 21, 20, 16, 15]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11155295372009277 seconds
Jaccard graph constructed in 0.39594268798828125 seconds
Wrote graph to binary file in 0.28209710121154785 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886023
Louvain completed 21 runs in 1.323824167251587 seconds
PhenoGraph complete in 2.1238250732421875 seconds
Found communities [-1, ... 17], with sizes: [169, 251, 155, 119, 90, 89, 85, 84, 58, 57, 49, 33, 30, 28, 23, 22, 21, 16, 13]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11206221580505371 seconds
Jaccard graph constructed in 0.45722389221191406 seconds
Wrote graph to binary file in 0.028032779693603516 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.881474
After 5 runs, maximum modularity is Q = 0.882949
Louvain completed 25 runs in 1.755934715270996 seconds
PhenoGraph complete in 2.3644959926605225 seconds
Found communities [-1, ... 16], with sizes: [145, 285, 189, 111, 93, 89, 73, 63, 60, 56, 47, 41, 31, 29, 25, 21, 18, 16]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11324906349182129 seconds
Jaccard graph constructed in 0.40363240242004395 seconds
Wrote graph to binary file in 0.050939321517944336 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886927
Louvain completed 21 runs in 1.3983840942382812 seconds
PhenoGraph complete in 1.9748764038085938 seconds
Found communities [-1, ... 16], with sizes: [183, 275, 208, 122, 62, 62, 57, 56, 54, 52, 47, 42, 34, 31, 29, 29, 27, 22]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11382365226745605 seconds
Jaccard graph constructed in 0.46040773391723633 seconds
Wrote graph to binary file in 0.027059555053710938 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882505
After 2 runs, maximum modularity is Q = 0.885266
Louvain completed 22 runs in 1.598015546798706 seconds
PhenoGraph complete in 2.2085375785827637 seconds
Found communities [-1, ... 17], with sizes: [166, 271, 208, 112, 86, 84, 65, 59, 58, 42, 40, 37, 29, 29, 29, 24, 24, 15, 14]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11151361465454102 seconds
Jaccard graph constructed in 0.5021193027496338 seconds
Wrote graph to binary file in 0.031245946884155273 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891541
Louvain completed 21 runs in 1.3343415260314941 seconds
PhenoGraph complete in 1.991173267364502 seconds
Found communities [-1, ... 19], with sizes: [140, 249, 172, 103, 86, 80, 59, 58, 54, 52, 51, 44, 43, 33, 31, 31, 30, 24, 21, 20, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.113037109375 seconds
Jaccard graph constructed in 0.45372676849365234 seconds
Wrote graph to binary file in 0.026300907135009766 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.883527
Louvain completed 21 runs in 1.3329782485961914 seconds
PhenoGraph complete in 1.9335401058197021 seconds
Found communities [-1, ... 18], with sizes: [178, 198, 186, 93, 89, 82, 75, 66, 64, 61, 57, 55, 39, 29, 27, 24, 23, 17, 16, 13]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1065073013305664 seconds
Jaccard graph constructed in 0.4500918388366699 seconds
Wrote graph to binary file in 0.04378342628479004 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885788
Louvain completed 21 runs in 1.5246539115905762 seconds
PhenoGraph complete in 2.1452653408050537 seconds
Found communities [-1, ... 20], with sizes: [168, 240, 168, 104, 101, 84, 81, 58, 53, 53, 47, 33, 30, 30, 29, 23, 19, 17, 17, 13, 13, 11]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10833430290222168 seconds
Jaccard graph constructed in 0.40520429611206055 seconds
Wrote graph to binary file in 0.040392160415649414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886998
Louvain completed 21 runs in 1.3182458877563477 seconds
PhenoGraph complete in 1.8945374488830566 seconds
Found communities [-1, ... 17], with sizes: [145, 265, 132, 92, 85, 82, 79, 70, 61, 59, 52, 46, 44, 38, 33, 32, 32, 23, 22]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1098332405090332 seconds
Jaccard graph constructed in 0.41411828994750977 seconds
Wrote graph to binary file in 0.03911447525024414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888591
Louvain completed 21 runs in 1.5329885482788086 seconds
PhenoGraph complete in 2.1093666553497314 seconds
Found communities [-1, ... 18], with sizes: [166, 233, 175, 85, 83, 75, 75, 73, 60, 54, 48, 41, 40, 38, 31, 30, 28, 22, 22, 13]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10961723327636719 seconds
Jaccard graph constructed in 0.5002346038818359 seconds
Wrote graph to binary file in 0.033417463302612305 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886099
Louvain completed 21 runs in 1.5221836566925049 seconds
PhenoGraph complete in 2.1778619289398193 seconds
Found communities [-1, ... 17], with sizes: [160, 247, 189, 98, 91, 85, 75, 60, 55, 50, 49, 41, 37, 35, 32, 30, 24, 20, 14]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10643386840820312 seconds
Jaccard graph constructed in 0.40942883491516113 seconds
Wrote graph to binary file in 0.2884066104888916 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.884044
After 5 runs, maximum modularity is Q = 0.885202
Louvain completed 25 runs in 1.7502760887145996 seconds
PhenoGraph complete in 2.568711519241333 seconds
Found communities [-1, ... 16], with sizes: [161, 255, 229, 114, 81, 70, 65, 61, 57, 53, 44, 40, 35, 33, 27, 24, 24, 19]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1112668514251709 seconds
Jaccard graph constructed in 0.45477819442749023 seconds
Wrote graph to binary file in 0.028890132904052734 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886769
After 9 runs, maximum modularity is Q = 0.887841
Louvain completed 29 runs in 1.9556076526641846 seconds
PhenoGraph complete in 2.5612595081329346 seconds
Found communities [-1, ... 19], with sizes: [147, 276, 154, 86, 82, 71, 59, 58, 55, 53, 50, 44, 43, 38, 36, 31, 28, 27, 25, 15, 14]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11242341995239258 seconds
Jaccard graph constructed in 0.3963158130645752 seconds
Wrote graph to binary file in 0.06721639633178711 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885387
Louvain completed 21 runs in 1.337975025177002 seconds
PhenoGraph complete in 1.9228894710540771 seconds
Found communities [-1, ... 20], with sizes: [162, 251, 196, 84, 82, 62, 57, 55, 54, 50, 48, 48, 46, 34, 32, 31, 22, 20, 20, 16, 11, 11]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11513066291809082 seconds
Jaccard graph constructed in 0.48522496223449707 seconds
Wrote graph to binary file in 0.028902292251586914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890073
Louvain completed 21 runs in 1.5114171504974365 seconds
PhenoGraph complete in 2.151106119155884 seconds
Found communities [-1, ... 15], with sizes: [150, 289, 197, 98, 94, 91, 89, 59, 55, 55, 45, 39, 30, 28, 27, 24, 22]

In [228]:
sc.pp.normalize_per_cell(D326_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D326_Brus_Dis1) # log transform the data
D326_Brus_Dis1.raw = D326_Brus_Dis1 # freeze the object (for later use of the raw state of it)
In [229]:
D326_Brus_Dis1 = D326_Brus_Dis1[:, D326_Brus_Dis1.var['ribo_genes']]
D326_Brus_Dis1
Out[229]:
View of AnnData object with n_obs × n_vars = 1114 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [230]:
D337_Brus_Dis1 = sc.read_10x_mtx(
    './D337_Brus_Dis1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D337_Brus_Dis1.var_names_make_unique()
D337_Brus_Dis1.obs['manip'] = 'D337_Brus_Dis1'
D337_Brus_Dis1.obs['position'] = 'Distal'
D337_Brus_Dis1.obs['method'] = 'Brushing'
D337_Brus_Dis1.obs['donor'] = 'D337'
D337_Brus_Dis1.obs['name'] = ['D337_Brus_Dis1' + s for s in list(D337_Brus_Dis1.obs.index)]
D337_Brus_Dis1.obs_names = D337_Brus_Dis1.obs['name']
D337_Brus_Dis1
... reading from cache file ./cache/D337_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[230]:
AnnData object with n_obs × n_vars = 1428 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [231]:
sc.pl.highest_expr_genes(D337_Brus_Dis1, n_top=20)
In [232]:
sc.pp.filter_cells(D337_Brus_Dis1, min_genes=0)
mito_genes = D337_Brus_Dis1.var_names.str.startswith('MT-')
D337_Brus_Dis1.obs['percent_mito'] = np.sum(
    D337_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D337_Brus_Dis1.X, axis=1).A1
D337_Brus_Dis1.obs['n_counts'] = D337_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D337_Brus_Dis1.to_df())
ribo_genes = D337_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D337_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D337_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D337_Brus_Dis1.X, axis=1).A1
D337_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D337_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [233]:
sc.pp.filter_cells(D337_Brus_Dis1, min_genes=500)
D337_Brus_Dis1 = D337_Brus_Dis1[D337_Brus_Dis1.obs['n_counts'] < 30000, :]
D337_Brus_Dis1 = D337_Brus_Dis1[D337_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
filtered out 17 cells that have less than 500 genes expressed
In [234]:
# scrublet
scrub = scr.Scrublet(D337_Brus_Dis1.X, expected_doublet_rate=0.012)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D337_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D337_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.12
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 40.2%
Overall doublet rate:
	Expected   = 1.2%
	Estimated  = 1.1%
Elapsed time: 1.0 seconds
Out[234]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea294a2b0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea13a24a8>],
       dtype=object))
In [235]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D337_Brus_Dis1.X).predict()
D337_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21335673332214355 seconds
Jaccard graph constructed in 0.4331936836242676 seconds
Wrote graph to binary file in 0.04625391960144043 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.889309
Louvain completed 21 runs in 1.4623045921325684 seconds
PhenoGraph complete in 2.1702306270599365 seconds
Found communities [-1, ... 17], with sizes: [280, 286, 234, 125, 122, 108, 95, 82, 73, 67, 53, 41, 33, 33, 31, 25, 24, 20, 15]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21367979049682617 seconds
Jaccard graph constructed in 0.46930909156799316 seconds
Wrote graph to binary file in 0.3069896697998047 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890979
After 8 runs, maximum modularity is Q = 0.892065
Louvain completed 28 runs in 2.011324167251587 seconds
PhenoGraph complete in 3.012248992919922 seconds
Found communities [-1, ... 16], with sizes: [272, 315, 293, 163, 132, 85, 83, 69, 65, 53, 49, 31, 30, 30, 24, 23, 19, 11]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21150565147399902 seconds
Jaccard graph constructed in 0.5038797855377197 seconds
Wrote graph to binary file in 0.040128469467163086 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891754
Louvain completed 21 runs in 1.4095039367675781 seconds
PhenoGraph complete in 2.1780807971954346 seconds
Found communities [-1, ... 17], with sizes: [240, 351, 215, 128, 119, 100, 92, 72, 71, 67, 54, 53, 34, 32, 31, 26, 24, 22, 16]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21395373344421387 seconds
Jaccard graph constructed in 0.46888303756713867 seconds
Wrote graph to binary file in 0.03740644454956055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886129
After 15 runs, maximum modularity is Q = 0.887213
Louvain completed 35 runs in 2.362523078918457 seconds
PhenoGraph complete in 3.094099283218384 seconds
Found communities [-1, ... 14], with sizes: [327, 292, 274, 160, 136, 99, 85, 70, 67, 54, 48, 38, 29, 25, 24, 19]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21154189109802246 seconds
Jaccard graph constructed in 0.4939708709716797 seconds
Wrote graph to binary file in 0.03615260124206543 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891106
After 2 runs, maximum modularity is Q = 0.89315
Louvain completed 22 runs in 1.6358544826507568 seconds
PhenoGraph complete in 2.3885648250579834 seconds
Found communities [-1, ... 17], with sizes: [301, 306, 264, 126, 98, 87, 78, 72, 67, 63, 61, 50, 47, 35, 29, 25, 16, 11, 11]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2123885154724121 seconds
Jaccard graph constructed in 0.5010015964508057 seconds
Wrote graph to binary file in 0.0345149040222168 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.88907
Louvain completed 21 runs in 1.3986222743988037 seconds
PhenoGraph complete in 2.1557860374450684 seconds
Found communities [-1, ... 16], with sizes: [303, 328, 264, 157, 139, 76, 70, 60, 55, 52, 50, 47, 30, 30, 28, 21, 19, 18]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21259760856628418 seconds
Jaccard graph constructed in 0.48282337188720703 seconds
Wrote graph to binary file in 0.036559104919433594 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888333
After 3 runs, maximum modularity is Q = 0.890067
Louvain completed 23 runs in 1.7427825927734375 seconds
PhenoGraph complete in 2.4860239028930664 seconds
Found communities [-1, ... 16], with sizes: [291, 341, 270, 150, 132, 86, 79, 72, 68, 59, 54, 28, 25, 23, 23, 18, 16, 12]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21264147758483887 seconds
Jaccard graph constructed in 0.48983144760131836 seconds
Wrote graph to binary file in 0.03361320495605469 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.889108
After 3 runs, maximum modularity is Q = 0.890511
Louvain completed 23 runs in 1.7381327152252197 seconds
PhenoGraph complete in 2.4849116802215576 seconds
Found communities [-1, ... 14], with sizes: [347, 313, 280, 136, 123, 104, 86, 65, 57, 53, 48, 33, 30, 29, 28, 15]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21393513679504395 seconds
Jaccard graph constructed in 0.471268892288208 seconds
Wrote graph to binary file in 0.03370261192321777 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.88863
Louvain completed 21 runs in 1.3928608894348145 seconds
PhenoGraph complete in 2.1209239959716797 seconds
Found communities [-1, ... 16], with sizes: [293, 425, 192, 165, 123, 80, 67, 66, 59, 50, 43, 41, 32, 30, 30, 25, 14, 12]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2122182846069336 seconds
Jaccard graph constructed in 0.48700690269470215 seconds
Wrote graph to binary file in 0.2865562438964844 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891116
Louvain completed 21 runs in 1.3843319416046143 seconds
PhenoGraph complete in 2.383152484893799 seconds
Found communities [-1, ... 17], with sizes: [273, 318, 285, 131, 105, 95, 71, 66, 66, 56, 50, 47, 32, 32, 31, 28, 24, 22, 15]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21279048919677734 seconds
Jaccard graph constructed in 0.47730088233947754 seconds
Wrote graph to binary file in 0.033003807067871094 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888836
After 4 runs, maximum modularity is Q = 0.89014
Louvain completed 24 runs in 1.8066763877868652 seconds
PhenoGraph complete in 2.5400307178497314 seconds
Found communities [-1, ... 14], with sizes: [311, 365, 263, 144, 134, 84, 67, 61, 60, 58, 49, 35, 35, 32, 28, 21]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21146702766418457 seconds
Jaccard graph constructed in 0.48106908798217773 seconds
Wrote graph to binary file in 0.034003257751464844 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.887739
After 2 runs, maximum modularity is Q = 0.889539
Louvain completed 22 runs in 1.6745891571044922 seconds
PhenoGraph complete in 2.4130375385284424 seconds
Found communities [-1, ... 14], with sizes: [284, 320, 301, 167, 143, 90, 87, 68, 61, 51, 45, 36, 29, 27, 24, 14]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2134096622467041 seconds
Jaccard graph constructed in 0.47774243354797363 seconds
Wrote graph to binary file in 0.036612749099731445 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891666
Louvain completed 21 runs in 1.3963208198547363 seconds
PhenoGraph complete in 2.136085271835327 seconds
Found communities [-1, ... 18], with sizes: [269, 349, 248, 150, 129, 89, 76, 70, 63, 61, 55, 36, 29, 25, 24, 19, 19, 13, 12, 11]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20694851875305176 seconds
Jaccard graph constructed in 0.4779665470123291 seconds
Wrote graph to binary file in 0.033194541931152344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.89278
Louvain completed 21 runs in 1.380201816558838 seconds
PhenoGraph complete in 2.10831618309021 seconds
Found communities [-1, ... 14], with sizes: [304, 320, 281, 139, 134, 91, 79, 75, 62, 55, 49, 35, 32, 32, 31, 28]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21299409866333008 seconds
Jaccard graph constructed in 0.4699113368988037 seconds
Wrote graph to binary file in 0.036470890045166016 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890938
Louvain completed 21 runs in 1.3843612670898438 seconds
PhenoGraph complete in 2.114614963531494 seconds
Found communities [-1, ... 15], with sizes: [325, 353, 235, 160, 134, 92, 68, 68, 65, 54, 48, 34, 29, 25, 21, 20, 16]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21335363388061523 seconds
Jaccard graph constructed in 0.4796717166900635 seconds
Wrote graph to binary file in 0.033203840255737305 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890016
After 3 runs, maximum modularity is Q = 0.891483
Louvain completed 23 runs in 1.7361114025115967 seconds
PhenoGraph complete in 2.475517511367798 seconds
Found communities [-1, ... 17], with sizes: [293, 330, 181, 162, 129, 98, 92, 87, 67, 55, 54, 51, 33, 28, 23, 20, 19, 13, 12]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2094576358795166 seconds
Jaccard graph constructed in 0.4783966541290283 seconds
Wrote graph to binary file in 0.034241676330566406 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891641
Louvain completed 21 runs in 1.3947453498840332 seconds
PhenoGraph complete in 2.1288998126983643 seconds
Found communities [-1, ... 15], with sizes: [345, 325, 259, 140, 128, 85, 70, 68, 67, 63, 48, 30, 29, 28, 24, 23, 15]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21103763580322266 seconds
Jaccard graph constructed in 0.4701263904571533 seconds
Wrote graph to binary file in 0.2873952388763428 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.889394
After 8 runs, maximum modularity is Q = 0.890548
Louvain completed 28 runs in 2.0043752193450928 seconds
PhenoGraph complete in 2.982663631439209 seconds
Found communities [-1, ... 17], with sizes: [312, 303, 265, 157, 132, 97, 69, 62, 62, 54, 50, 32, 31, 24, 24, 24, 20, 17, 12]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21220898628234863 seconds
Jaccard graph constructed in 0.4945716857910156 seconds
Wrote graph to binary file in 0.033548831939697266 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886742
After 5 runs, maximum modularity is Q = 0.887786
Louvain completed 25 runs in 1.8308155536651611 seconds
PhenoGraph complete in 2.5825717449188232 seconds
Found communities [-1, ... 15], with sizes: [296, 406, 224, 145, 131, 82, 77, 75, 69, 65, 54, 29, 26, 23, 19, 14, 12]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21071267127990723 seconds
Jaccard graph constructed in 0.49286913871765137 seconds
Wrote graph to binary file in 0.033945322036743164 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890355
Louvain completed 21 runs in 1.3779034614562988 seconds
PhenoGraph complete in 2.127120018005371 seconds
Found communities [-1, ... 14], with sizes: [304, 345, 274, 157, 135, 80, 79, 70, 64, 50, 49, 30, 29, 28, 28, 25]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2136213779449463 seconds
Jaccard graph constructed in 0.4851834774017334 seconds
Wrote graph to binary file in 0.03219246864318848 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888702
Louvain completed 21 runs in 1.4015047550201416 seconds
PhenoGraph complete in 2.143026828765869 seconds
Found communities [-1, ... 14], with sizes: [321, 381, 225, 154, 137, 74, 72, 65, 59, 55, 53, 33, 32, 32, 31, 23]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21358609199523926 seconds
Jaccard graph constructed in 0.47631025314331055 seconds
Wrote graph to binary file in 0.03299760818481445 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893192
Louvain completed 21 runs in 1.3891570568084717 seconds
PhenoGraph complete in 2.123581886291504 seconds
Found communities [-1, ... 15], with sizes: [265, 341, 308, 165, 128, 97, 74, 71, 64, 49, 45, 30, 29, 26, 22, 21, 12]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2122657299041748 seconds
Jaccard graph constructed in 0.4299893379211426 seconds
Wrote graph to binary file in 0.04511857032775879 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.88642
Louvain completed 21 runs in 1.3999876976013184 seconds
PhenoGraph complete in 2.127727746963501 seconds
Found communities [-1, ... 17], with sizes: [282, 355, 248, 129, 128, 100, 89, 80, 64, 54, 49, 30, 30, 28, 24, 18, 16, 12, 11]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21165895462036133 seconds
Jaccard graph constructed in 0.4974677562713623 seconds
Wrote graph to binary file in 0.0336461067199707 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891933
Louvain completed 21 runs in 1.4018588066101074 seconds
PhenoGraph complete in 2.156179904937744 seconds
Found communities [-1, ... 17], with sizes: [287, 309, 291, 160, 128, 89, 77, 72, 63, 58, 49, 32, 28, 24, 20, 18, 15, 14, 13]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21214962005615234 seconds
Jaccard graph constructed in 0.47988247871398926 seconds
Wrote graph to binary file in 0.2532627582550049 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888439
Louvain completed 21 runs in 1.398589849472046 seconds
PhenoGraph complete in 2.357525587081909 seconds
Found communities [-1, ... 17], with sizes: [289, 371, 247, 132, 120, 101, 92, 87, 59, 48, 35, 32, 29, 25, 24, 18, 13, 13, 12]

In [236]:
sc.pp.normalize_per_cell(D337_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D337_Brus_Dis1) # log transform the data
D337_Brus_Dis1.raw = D337_Brus_Dis1 # freeze the object (for later use of the raw state of it)
In [237]:
D337_Brus_Dis1 = D337_Brus_Dis1[:, D337_Brus_Dis1.var['ribo_genes']]
D337_Brus_Dis1
Out[237]:
View of AnnData object with n_obs × n_vars = 1398 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [238]:
D339_Brus_Dis1 = sc.read_10x_mtx(
    './D339_Brus_Dis1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D339_Brus_Dis1.var_names_make_unique()
D339_Brus_Dis1.obs['manip'] = 'D339_Brus_Dis1'
D339_Brus_Dis1.obs['position'] = 'Distal'
D339_Brus_Dis1.obs['method'] = 'Brushing'
D339_Brus_Dis1.obs['donor'] = 'D339'
D339_Brus_Dis1.obs['name'] = ['D339_Brus_Dis1_' + s for s in list(D339_Brus_Dis1.obs.index)]
D339_Brus_Dis1.obs_names = D339_Brus_Dis1.obs['name']
D339_Brus_Dis1
... reading from cache file ./cache/D339_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[238]:
AnnData object with n_obs × n_vars = 1382 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [239]:
sc.pl.highest_expr_genes(D339_Brus_Dis1, n_top=20)
In [240]:
sc.pp.filter_cells(D339_Brus_Dis1, min_genes=0)
mito_genes = D339_Brus_Dis1.var_names.str.startswith('MT-')
D339_Brus_Dis1.obs['percent_mito'] = np.sum(
    D339_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D339_Brus_Dis1.X, axis=1).A1
D339_Brus_Dis1.obs['n_counts'] = D339_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D339_Brus_Dis1.to_df())
ribo_genes = D339_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D339_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D339_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D339_Brus_Dis1.X, axis=1).A1
D339_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D339_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [241]:
sc.pp.filter_cells(D339_Brus_Dis1, min_genes=500)
D339_Brus_Dis1 = D339_Brus_Dis1[D339_Brus_Dis1.obs['n_counts'] < 15000, :]
D339_Brus_Dis1 = D339_Brus_Dis1[D339_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
filtered out 35 cells that have less than 500 genes expressed
In [242]:
# scrublet
scrub = scr.Scrublet(D339_Brus_Dis1.X, expected_doublet_rate=0.012)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D339_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D339_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.12
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 31.3%
Overall doublet rate:
	Expected   = 1.2%
	Estimated  = 1.4%
Elapsed time: 0.7 seconds
Out[242]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea0b3f7b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea0b73ba8>],
       dtype=object))
In [243]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D339_Brus_Dis1.X).predict()
D339_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11077642440795898 seconds
Jaccard graph constructed in 0.46425771713256836 seconds
Wrote graph to binary file in 0.029248476028442383 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885009
Louvain completed 21 runs in 1.3841643333435059 seconds
PhenoGraph complete in 2.0007967948913574 seconds
Found communities [-1, ... 21], with sizes: [160, 162, 153, 140, 111, 95, 91, 81, 77, 76, 76, 69, 69, 65, 53, 33, 32, 25, 23, 22, 21, 18, 16]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11488533020019531 seconds
Jaccard graph constructed in 0.41995882987976074 seconds
Wrote graph to binary file in 0.034644365310668945 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888411
Louvain completed 21 runs in 1.3784821033477783 seconds
PhenoGraph complete in 1.9651494026184082 seconds
Found communities [-1, ... 20], with sizes: [160, 173, 149, 143, 98, 94, 87, 80, 78, 77, 77, 76, 71, 71, 47, 41, 36, 27, 25, 21, 20, 17]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.210890531539917 seconds
Jaccard graph constructed in 0.4322068691253662 seconds
Wrote graph to binary file in 0.042722463607788086 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.883775
Louvain completed 21 runs in 1.4371147155761719 seconds
PhenoGraph complete in 2.1498568058013916 seconds
Found communities [-1, ... 21], with sizes: [117, 259, 167, 152, 84, 80, 79, 77, 77, 76, 75, 70, 63, 42, 42, 34, 33, 33, 24, 24, 24, 20, 16]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10892605781555176 seconds
Jaccard graph constructed in 0.40697622299194336 seconds
Wrote graph to binary file in 0.03625774383544922 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885898
After 9 runs, maximum modularity is Q = 0.887208
Louvain completed 29 runs in 2.031949520111084 seconds
PhenoGraph complete in 2.606541156768799 seconds
Found communities [-1, ... 21], with sizes: [175, 204, 167, 157, 107, 89, 85, 84, 70, 68, 65, 62, 47, 43, 42, 37, 33, 32, 27, 26, 20, 15, 13]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11081123352050781 seconds
Jaccard graph constructed in 0.4158029556274414 seconds
Wrote graph to binary file in 0.3278360366821289 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.889271
Louvain completed 21 runs in 1.393808126449585 seconds
PhenoGraph complete in 2.260817050933838 seconds
Found communities [-1, ... 21], with sizes: [152, 259, 181, 146, 99, 91, 79, 76, 71, 67, 63, 49, 47, 47, 46, 46, 37, 23, 22, 22, 16, 16, 13]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11050081253051758 seconds
Jaccard graph constructed in 0.5362215042114258 seconds
Wrote graph to binary file in 0.044724464416503906 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.883253
Louvain completed 21 runs in 1.5084295272827148 seconds
PhenoGraph complete in 2.21579909324646 seconds
Found communities [-1, ... 21], with sizes: [150, 250, 162, 124, 120, 105, 94, 84, 76, 75, 64, 62, 57, 51, 37, 34, 27, 21, 17, 17, 14, 14, 13]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10633039474487305 seconds
Jaccard graph constructed in 0.43742847442626953 seconds
Wrote graph to binary file in 0.05077648162841797 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.883893
After 7 runs, maximum modularity is Q = 0.885032
Louvain completed 27 runs in 1.9285502433776855 seconds
PhenoGraph complete in 2.541843891143799 seconds
Found communities [-1, ... 20], with sizes: [130, 250, 162, 115, 113, 105, 90, 88, 86, 86, 78, 68, 66, 53, 45, 30, 21, 20, 18, 16, 15, 13]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10640215873718262 seconds
Jaccard graph constructed in 0.4886972904205322 seconds
Wrote graph to binary file in 0.04035234451293945 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885462
Louvain completed 21 runs in 1.389136552810669 seconds
PhenoGraph complete in 2.035393714904785 seconds
Found communities [-1, ... 23], with sizes: [140, 176, 152, 138, 115, 97, 94, 91, 77, 77, 76, 76, 72, 37, 37, 37, 30, 25, 22, 20, 17, 17, 16, 16, 13]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11144781112670898 seconds
Jaccard graph constructed in 0.47895145416259766 seconds
Wrote graph to binary file in 0.03691458702087402 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.887619
After 3 runs, maximum modularity is Q = 0.888856
Louvain completed 23 runs in 1.7179548740386963 seconds
PhenoGraph complete in 2.3571958541870117 seconds
Found communities [-1, ... 21], with sizes: [133, 157, 134, 118, 104, 101, 94, 90, 89, 88, 86, 85, 85, 69, 56, 39, 29, 25, 22, 19, 15, 15, 15]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21216130256652832 seconds
Jaccard graph constructed in 0.4791691303253174 seconds
Wrote graph to binary file in 0.03528738021850586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888537
Louvain completed 21 runs in 1.3970263004302979 seconds
PhenoGraph complete in 2.1338586807250977 seconds
Found communities [-1, ... 21], with sizes: [161, 187, 161, 129, 114, 110, 97, 86, 82, 80, 74, 71, 54, 47, 35, 32, 32, 23, 21, 20, 20, 17, 15]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11372590065002441 seconds
Jaccard graph constructed in 0.5232908725738525 seconds
Wrote graph to binary file in 0.04384946823120117 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882984
Louvain completed 21 runs in 1.6067731380462646 seconds
PhenoGraph complete in 2.300321578979492 seconds
Found communities [-1, ... 21], with sizes: [143, 175, 166, 154, 108, 97, 88, 86, 84, 81, 75, 72, 65, 59, 41, 40, 31, 21, 20, 18, 18, 13, 13]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1136023998260498 seconds
Jaccard graph constructed in 0.495316743850708 seconds
Wrote graph to binary file in 0.3014347553253174 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885818
Louvain completed 21 runs in 1.4269380569458008 seconds
PhenoGraph complete in 2.3467276096343994 seconds
Found communities [-1, ... 22], with sizes: [142, 184, 182, 147, 101, 92, 85, 84, 78, 77, 76, 75, 55, 51, 46, 33, 29, 24, 21, 21, 19, 17, 16, 13]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11040973663330078 seconds
Jaccard graph constructed in 0.5481421947479248 seconds
Wrote graph to binary file in 0.04265451431274414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.883822
Louvain completed 21 runs in 1.455559253692627 seconds
PhenoGraph complete in 2.169980525970459 seconds
Found communities [-1, ... 19], with sizes: [151, 179, 157, 138, 102, 99, 92, 92, 90, 88, 77, 69, 68, 59, 46, 42, 38, 27, 21, 17, 16]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11211466789245605 seconds
Jaccard graph constructed in 0.5048460960388184 seconds
Wrote graph to binary file in 0.03805422782897949 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.884942
Louvain completed 21 runs in 1.3988964557647705 seconds
PhenoGraph complete in 2.074927806854248 seconds
Found communities [-1, ... 19], with sizes: [146, 164, 151, 110, 106, 105, 93, 91, 88, 84, 84, 79, 77, 74, 56, 42, 36, 23, 23, 22, 14]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10606861114501953 seconds
Jaccard graph constructed in 0.44935131072998047 seconds
Wrote graph to binary file in 0.05611062049865723 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882788
After 2 runs, maximum modularity is Q = 0.884078
Louvain completed 22 runs in 1.7772152423858643 seconds
PhenoGraph complete in 2.406134843826294 seconds
Found communities [-1, ... 21], with sizes: [127, 161, 130, 122, 110, 106, 94, 91, 90, 77, 76, 71, 68, 67, 63, 62, 36, 23, 22, 21, 20, 17, 14]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1152658462524414 seconds
Jaccard graph constructed in 0.535527229309082 seconds
Wrote graph to binary file in 0.04455399513244629 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.882038
After 2 runs, maximum modularity is Q = 0.883485
Louvain completed 22 runs in 1.9561593532562256 seconds
PhenoGraph complete in 2.664527654647827 seconds
Found communities [-1, ... 21], with sizes: [148, 173, 155, 141, 128, 120, 96, 82, 81, 81, 77, 72, 71, 52, 45, 30, 24, 19, 18, 15, 14, 13, 13]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10978841781616211 seconds
Jaccard graph constructed in 0.48043322563171387 seconds
Wrote graph to binary file in 0.04678463935852051 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890257
Louvain completed 21 runs in 1.4102518558502197 seconds
PhenoGraph complete in 2.0746712684631348 seconds
Found communities [-1, ... 22], with sizes: [161, 181, 149, 108, 107, 100, 94, 90, 84, 75, 73, 63, 60, 56, 51, 42, 41, 25, 24, 22, 18, 16, 15, 13]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11337614059448242 seconds
Jaccard graph constructed in 0.4776909351348877 seconds
Wrote graph to binary file in 0.03323507308959961 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.883034
After 4 runs, maximum modularity is Q = 0.884892
Louvain completed 24 runs in 1.7922523021697998 seconds
PhenoGraph complete in 2.4263017177581787 seconds
Found communities [-1, ... 19], with sizes: [140, 165, 129, 124, 108, 102, 98, 91, 90, 86, 84, 81, 79, 77, 75, 36, 27, 25, 18, 17, 16]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11046648025512695 seconds
Jaccard graph constructed in 0.5300097465515137 seconds
Wrote graph to binary file in 0.034813880920410156 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886437
After 3 runs, maximum modularity is Q = 0.887873
Louvain completed 23 runs in 1.7248668670654297 seconds
PhenoGraph complete in 2.4114723205566406 seconds
Found communities [-1, ... 21], with sizes: [174, 165, 154, 135, 104, 93, 87, 84, 84, 78, 76, 67, 64, 55, 46, 37, 36, 34, 26, 25, 18, 14, 12]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11335539817810059 seconds
Jaccard graph constructed in 0.4874453544616699 seconds
Wrote graph to binary file in 0.29427146911621094 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.883389
After 2 runs, maximum modularity is Q = 0.884773
Louvain completed 22 runs in 1.6713039875030518 seconds
PhenoGraph complete in 2.5767195224761963 seconds
Found communities [-1, ... 21], with sizes: [119, 241, 170, 154, 98, 93, 83, 81, 74, 73, 68, 65, 62, 46, 43, 34, 31, 29, 27, 21, 20, 18, 18]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.1126549243927002 seconds
Jaccard graph constructed in 0.5239880084991455 seconds
Wrote graph to binary file in 0.03842663764953613 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.883117
After 2 runs, maximum modularity is Q = 0.885533
Louvain completed 22 runs in 1.967371940612793 seconds
PhenoGraph complete in 2.654338836669922 seconds
Found communities [-1, ... 18], with sizes: [159, 254, 162, 155, 125, 106, 97, 90, 83, 72, 72, 55, 47, 44, 30, 28, 27, 23, 22, 17]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.10812759399414062 seconds
Jaccard graph constructed in 0.4925107955932617 seconds
Wrote graph to binary file in 0.036156415939331055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.881369
After 3 runs, maximum modularity is Q = 0.88393
Louvain completed 23 runs in 1.7236568927764893 seconds
PhenoGraph complete in 2.3699145317077637 seconds
Found communities [-1, ... 21], with sizes: [135, 160, 159, 156, 103, 88, 88, 82, 80, 78, 77, 76, 70, 67, 47, 44, 32, 31, 22, 19, 19, 18, 17]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11336946487426758 seconds
Jaccard graph constructed in 0.47954440116882324 seconds
Wrote graph to binary file in 0.0342555046081543 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885208
After 4 runs, maximum modularity is Q = 0.886291
Louvain completed 24 runs in 1.7783267498016357 seconds
PhenoGraph complete in 2.415379285812378 seconds
Found communities [-1, ... 20], with sizes: [139, 266, 179, 97, 93, 92, 90, 82, 80, 74, 73, 68, 60, 50, 49, 43, 38, 25, 22, 18, 16, 14]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.11088943481445312 seconds
Jaccard graph constructed in 0.488300085067749 seconds
Wrote graph to binary file in 0.03625321388244629 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.884459
After 4 runs, maximum modularity is Q = 0.885983
After 5 runs, maximum modularity is Q = 0.88705
Louvain completed 25 runs in 2.0681710243225098 seconds
PhenoGraph complete in 2.7141079902648926 seconds
Found communities [-1, ... 21], with sizes: [151, 248, 155, 111, 111, 89, 83, 76, 73, 68, 65, 63, 60, 56, 53, 44, 33, 32, 31, 23, 15, 15, 13]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21431660652160645 seconds
Jaccard graph constructed in 0.5337605476379395 seconds
Wrote graph to binary file in 0.04564046859741211 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.884847
Louvain completed 21 runs in 1.6112499237060547 seconds
PhenoGraph complete in 2.419363021850586 seconds
Found communities [-1, ... 21], with sizes: [117, 161, 140, 140, 102, 97, 88, 84, 81, 81, 79, 79, 76, 74, 70, 37, 36, 26, 26, 24, 20, 18, 12]

In [244]:
sc.pp.normalize_per_cell(D339_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D339_Brus_Dis1) # log transform the data
D339_Brus_Dis1.raw = D339_Brus_Dis1 # freeze the object (for later use of the raw state of it)
In [245]:
D339_Brus_Dis1 = D339_Brus_Dis1[:, D339_Brus_Dis1.var['ribo_genes']]
D339_Brus_Dis1
Out[245]:
View of AnnData object with n_obs × n_vars = 1335 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [246]:
D344_Brus_Dis1 = sc.read_10x_mtx(
    './D344_Brus_Dis1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D344_Brus_Dis1.var_names_make_unique()
D344_Brus_Dis1.obs['manip'] = 'D344_Brus_Dis1'
D344_Brus_Dis1.obs['position'] = 'Distal'
D344_Brus_Dis1.obs['method'] = 'Brushing'
D344_Brus_Dis1.obs['donor'] = 'D344'
D344_Brus_Dis1.obs['name'] = ['D344_Brus_Dis1_' + s for s in list(D344_Brus_Dis1.obs.index)]
D344_Brus_Dis1.obs_names = D344_Brus_Dis1.obs['name']
D344_Brus_Dis1
... reading from cache file ./cache/D344_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[246]:
AnnData object with n_obs × n_vars = 2817 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [247]:
sc.pl.highest_expr_genes(D344_Brus_Dis1, n_top=20)
In [248]:
sc.pp.filter_cells(D344_Brus_Dis1, min_genes=0)
mito_genes = D344_Brus_Dis1.var_names.str.startswith('MT-')
D344_Brus_Dis1.obs['percent_mito'] = np.sum(
    D344_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D344_Brus_Dis1.X, axis=1).A1
D344_Brus_Dis1.obs['n_counts'] = D344_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D344_Brus_Dis1.to_df())
ribo_genes = D344_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D344_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D344_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D344_Brus_Dis1.X, axis=1).A1
D344_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D344_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [249]:
sc.pp.filter_cells(D344_Brus_Dis1, min_genes=500)
D344_Brus_Dis1 = D344_Brus_Dis1[D344_Brus_Dis1.obs['n_counts'] < 30000, :]
D344_Brus_Dis1 = D344_Brus_Dis1[D344_Brus_Dis1.obs['percent_mito'] < 0.3 , :]
filtered out 8 cells that have less than 500 genes expressed
In [250]:
# scrublet
scrub = scr.Scrublet(D344_Brus_Dis1.X, expected_doublet_rate=0.023)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D344_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D344_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.17
Detected doublet rate = 1.4%
Estimated detectable doublet fraction = 42.4%
Overall doublet rate:
	Expected   = 2.3%
	Estimated  = 3.4%
Elapsed time: 2.5 seconds
Out[250]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea282b630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea1389ac8>],
       dtype=object))
In [251]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D344_Brus_Dis1.X).predict()
D344_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4087679386138916 seconds
Jaccard graph constructed in 0.676398515701294 seconds
Wrote graph to binary file in 0.06066608428955078 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908475
After 2 runs, maximum modularity is Q = 0.910893
Louvain completed 22 runs in 2.0190682411193848 seconds
PhenoGraph complete in 3.1778438091278076 seconds
Found communities [-1, ... 21], with sizes: [180, 658, 412, 412, 283, 264, 186, 150, 147, 114, 87, 81, 79, 79, 67, 54, 48, 44, 39, 33, 31, 26, 12]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5136964321136475 seconds
Jaccard graph constructed in 0.6911904811859131 seconds
Wrote graph to binary file in 0.06196713447570801 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905674
Louvain completed 21 runs in 1.6694996356964111 seconds
PhenoGraph complete in 2.9525020122528076 seconds
Found communities [-1, ... 21], with sizes: [203, 654, 435, 407, 294, 235, 170, 140, 117, 116, 115, 80, 76, 73, 72, 64, 57, 47, 37, 35, 32, 16, 11]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.5086314678192139 seconds
Jaccard graph constructed in 0.6270010471343994 seconds
Wrote graph to binary file in 0.32866406440734863 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909567
Louvain completed 21 runs in 1.693134069442749 seconds
PhenoGraph complete in 3.1828861236572266 seconds
Found communities [-1, ... 20], with sizes: [157, 426, 424, 374, 314, 257, 223, 190, 173, 165, 118, 98, 94, 91, 85, 71, 56, 53, 41, 36, 21, 19]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30912256240844727 seconds
Jaccard graph constructed in 0.6146574020385742 seconds
Wrote graph to binary file in 0.0640110969543457 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908697
Louvain completed 21 runs in 1.705700159072876 seconds
PhenoGraph complete in 2.7090213298797607 seconds
Found communities [-1, ... 21], with sizes: [139, 462, 425, 396, 252, 250, 213, 186, 148, 130, 130, 114, 76, 73, 72, 72, 71, 59, 55, 47, 46, 36, 34]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4136331081390381 seconds
Jaccard graph constructed in 0.6199691295623779 seconds
Wrote graph to binary file in 0.06575632095336914 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90764
After 2 runs, maximum modularity is Q = 0.909513
Louvain completed 22 runs in 2.0296332836151123 seconds
PhenoGraph complete in 3.145488739013672 seconds
Found communities [-1, ... 21], with sizes: [153, 560, 451, 392, 343, 298, 209, 130, 111, 102, 81, 80, 77, 75, 74, 71, 68, 66, 45, 35, 25, 20, 20]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30895423889160156 seconds
Jaccard graph constructed in 0.7187116146087646 seconds
Wrote graph to binary file in 0.30313754081726074 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909477
Louvain completed 21 runs in 1.780221939086914 seconds
PhenoGraph complete in 3.1288363933563232 seconds
Found communities [-1, ... 18], with sizes: [186, 642, 590, 286, 279, 222, 212, 205, 126, 111, 108, 86, 86, 73, 62, 60, 59, 37, 33, 23]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41129422187805176 seconds
Jaccard graph constructed in 0.6924159526824951 seconds
Wrote graph to binary file in 0.06309294700622559 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908917
Louvain completed 21 runs in 1.7447381019592285 seconds
PhenoGraph complete in 2.9282872676849365 seconds
Found communities [-1, ... 23], with sizes: [152, 664, 450, 371, 278, 228, 186, 144, 133, 112, 105, 94, 84, 78, 72, 63, 62, 51, 35, 35, 34, 19, 13, 12, 11]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4090301990509033 seconds
Jaccard graph constructed in 0.6362454891204834 seconds
Wrote graph to binary file in 0.06520462036132812 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909113
Louvain completed 21 runs in 1.7199881076812744 seconds
PhenoGraph complete in 2.8456263542175293 seconds
Found communities [-1, ... 21], with sizes: [162, 542, 452, 399, 288, 267, 251, 210, 136, 110, 98, 91, 88, 81, 69, 64, 55, 34, 31, 21, 14, 12, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3094162940979004 seconds
Jaccard graph constructed in 0.6429347991943359 seconds
Wrote graph to binary file in 0.06117081642150879 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909833
Louvain completed 21 runs in 1.7047889232635498 seconds
PhenoGraph complete in 2.7326271533966064 seconds
Found communities [-1, ... 21], with sizes: [164, 479, 447, 398, 331, 298, 194, 151, 142, 140, 114, 96, 87, 86, 85, 63, 54, 46, 35, 29, 21, 15, 11]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40952610969543457 seconds
Jaccard graph constructed in 0.6572070121765137 seconds
Wrote graph to binary file in 0.4156205654144287 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910152
After 13 runs, maximum modularity is Q = 0.911166
Louvain completed 33 runs in 2.755455493927002 seconds
PhenoGraph complete in 4.256148815155029 seconds
Found communities [-1, ... 22], with sizes: [193, 449, 415, 411, 292, 266, 209, 188, 129, 111, 106, 103, 94, 89, 73, 68, 65, 57, 51, 31, 31, 21, 18, 16]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4093315601348877 seconds
Jaccard graph constructed in 0.6507017612457275 seconds
Wrote graph to binary file in 0.06365847587585449 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910742
Louvain completed 21 runs in 1.6846542358398438 seconds
PhenoGraph complete in 2.825324296951294 seconds
Found communities [-1, ... 19], with sizes: [175, 591, 451, 425, 326, 291, 206, 119, 112, 110, 109, 75, 75, 73, 73, 69, 67, 57, 34, 33, 15]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30977797508239746 seconds
Jaccard graph constructed in 0.6322612762451172 seconds
Wrote graph to binary file in 0.06093192100524902 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.91032
Louvain completed 21 runs in 1.7355291843414307 seconds
PhenoGraph complete in 2.752293586730957 seconds
Found communities [-1, ... 21], with sizes: [162, 440, 431, 420, 381, 274, 215, 162, 158, 129, 105, 94, 87, 71, 57, 55, 52, 48, 47, 36, 34, 15, 13]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31106996536254883 seconds
Jaccard graph constructed in 0.7185852527618408 seconds
Wrote graph to binary file in 0.32609009742736816 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911352
After 12 runs, maximum modularity is Q = 0.912368
Louvain completed 32 runs in 2.7134311199188232 seconds
PhenoGraph complete in 4.0834503173828125 seconds
Found communities [-1, ... 22], with sizes: [192, 472, 411, 402, 356, 296, 205, 177, 112, 105, 91, 82, 80, 72, 68, 65, 60, 60, 35, 34, 32, 31, 24, 24]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40963220596313477 seconds
Jaccard graph constructed in 0.7162106037139893 seconds
Wrote graph to binary file in 0.06307792663574219 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905629
After 2 runs, maximum modularity is Q = 0.906655
Louvain completed 22 runs in 2.098421096801758 seconds
PhenoGraph complete in 3.3031489849090576 seconds
Found communities [-1, ... 19], with sizes: [185, 578, 432, 409, 301, 290, 279, 141, 117, 109, 101, 90, 79, 73, 73, 69, 51, 33, 32, 23, 21]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3199498653411865 seconds
Jaccard graph constructed in 0.6934974193572998 seconds
Wrote graph to binary file in 0.0614933967590332 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909638
Louvain completed 21 runs in 1.7370011806488037 seconds
PhenoGraph complete in 2.8248116970062256 seconds
Found communities [-1, ... 22], with sizes: [184, 626, 433, 398, 273, 198, 176, 169, 155, 116, 109, 81, 78, 76, 74, 66, 59, 48, 36, 34, 32, 23, 23, 19]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4096238613128662 seconds
Jaccard graph constructed in 0.7094638347625732 seconds
Wrote graph to binary file in 0.3516569137573242 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908799
Louvain completed 21 runs in 1.6793816089630127 seconds
PhenoGraph complete in 3.1653077602386475 seconds
Found communities [-1, ... 19], with sizes: [226, 633, 562, 275, 257, 233, 232, 196, 111, 110, 99, 93, 91, 84, 68, 57, 42, 35, 33, 27, 22]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30857133865356445 seconds
Jaccard graph constructed in 0.6490199565887451 seconds
Wrote graph to binary file in 0.0794672966003418 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909344
Louvain completed 21 runs in 1.7038657665252686 seconds
PhenoGraph complete in 2.763385057449341 seconds
Found communities [-1, ... 19], with sizes: [178, 659, 463, 394, 293, 233, 210, 179, 173, 104, 94, 78, 74, 74, 64, 52, 47, 45, 33, 28, 11]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41222643852233887 seconds
Jaccard graph constructed in 0.6229174137115479 seconds
Wrote graph to binary file in 0.06175661087036133 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910231
Louvain completed 21 runs in 1.7293064594268799 seconds
PhenoGraph complete in 2.8399322032928467 seconds
Found communities [-1, ... 19], with sizes: [141, 624, 476, 443, 275, 212, 196, 194, 131, 115, 109, 84, 78, 67, 63, 62, 53, 52, 45, 33, 33]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3116340637207031 seconds
Jaccard graph constructed in 0.6231060028076172 seconds
Wrote graph to binary file in 0.2984890937805176 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906889
After 3 runs, maximum modularity is Q = 0.908115
After 21 runs, maximum modularity is Q = 0.909208
Louvain completed 41 runs in 3.6392505168914795 seconds
PhenoGraph complete in 4.895208835601807 seconds
Found communities [-1, ... 22], with sizes: [167, 534, 430, 366, 283, 277, 189, 160, 122, 121, 118, 109, 91, 78, 77, 68, 62, 61, 42, 41, 33, 23, 23, 11]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.40950465202331543 seconds
Jaccard graph constructed in 0.6320044994354248 seconds
Wrote graph to binary file in 0.06148862838745117 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912087
Louvain completed 21 runs in 1.7090568542480469 seconds
PhenoGraph complete in 2.8252131938934326 seconds
Found communities [-1, ... 22], with sizes: [207, 426, 414, 369, 361, 265, 192, 188, 165, 119, 108, 85, 80, 68, 68, 68, 57, 52, 49, 34, 33, 33, 23, 22]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3101065158843994 seconds
Jaccard graph constructed in 0.7053666114807129 seconds
Wrote graph to binary file in 0.06212496757507324 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907665
After 5 runs, maximum modularity is Q = 0.908836
Louvain completed 25 runs in 2.2621588706970215 seconds
PhenoGraph complete in 3.356046438217163 seconds
Found communities [-1, ... 20], with sizes: [144, 649, 433, 409, 371, 252, 211, 142, 106, 100, 81, 77, 67, 63, 61, 60, 60, 44, 42, 41, 37, 36]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3102238178253174 seconds
Jaccard graph constructed in 0.7130467891693115 seconds
Wrote graph to binary file in 0.061510562896728516 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908361
Louvain completed 21 runs in 1.685814619064331 seconds
PhenoGraph complete in 2.784780263900757 seconds
Found communities [-1, ... 22], with sizes: [141, 441, 423, 415, 409, 297, 262, 122, 113, 111, 105, 91, 81, 80, 71, 63, 53, 47, 46, 33, 32, 24, 15, 11]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.414994478225708 seconds
Jaccard graph constructed in 0.9955241680145264 seconds
Wrote graph to binary file in 0.0627129077911377 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909846
Louvain completed 21 runs in 1.7044715881347656 seconds
PhenoGraph complete in 3.199652671813965 seconds
Found communities [-1, ... 20], with sizes: [167, 682, 429, 398, 290, 230, 228, 206, 109, 105, 89, 76, 72, 68, 64, 53, 51, 50, 35, 35, 25, 24]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3100249767303467 seconds
Jaccard graph constructed in 0.6987600326538086 seconds
Wrote graph to binary file in 0.0625908374786377 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911101
After 8 runs, maximum modularity is Q = 0.912188
Louvain completed 28 runs in 2.3771181106567383 seconds
PhenoGraph complete in 3.4672839641571045 seconds
Found communities [-1, ... 21], with sizes: [207, 414, 391, 384, 365, 299, 220, 175, 162, 120, 104, 84, 74, 72, 71, 69, 68, 54, 39, 33, 33, 24, 24]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3088405132293701 seconds
Jaccard graph constructed in 0.7303566932678223 seconds
Wrote graph to binary file in 0.061276912689208984 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908846
Louvain completed 21 runs in 1.6693217754364014 seconds
PhenoGraph complete in 2.788783073425293 seconds
Found communities [-1, ... 20], with sizes: [191, 475, 428, 382, 376, 280, 218, 193, 132, 115, 102, 100, 96, 75, 71, 67, 67, 34, 34, 22, 16, 12]

In [252]:
sc.pp.normalize_per_cell(D344_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D344_Brus_Dis1) # log transform the data
D344_Brus_Dis1.raw = D344_Brus_Dis1 # freeze the object (for later use of the raw state of it)
In [253]:
D344_Brus_Dis1 = D344_Brus_Dis1[:, D344_Brus_Dis1.var['ribo_genes']]
D344_Brus_Dis1
Out[253]:
View of AnnData object with n_obs × n_vars = 2789 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [254]:
D353_Brus_Dis1 = sc.read_10x_mtx(
    './D353_Brus_Dis1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D353_Brus_Dis1.var_names_make_unique()
D353_Brus_Dis1.obs['manip'] = 'D353_Brus_Dis1'
D353_Brus_Dis1.obs['position'] = 'Distal'
D353_Brus_Dis1.obs['method'] = 'Brushing'
D353_Brus_Dis1.obs['donor'] = 'D353'
D353_Brus_Dis1.obs['name'] = ['D353_Brus_Dis1_' + s for s in list(D353_Brus_Dis1.obs.index)]
D353_Brus_Dis1.obs_names = D353_Brus_Dis1.obs['name']
D353_Brus_Dis1
... reading from cache file ./cache/D353_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[254]:
AnnData object with n_obs × n_vars = 4787 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [255]:
sc.pl.highest_expr_genes(D353_Brus_Dis1, n_top=20)
In [256]:
sc.pp.filter_cells(D353_Brus_Dis1, min_genes=0)
mito_genes = D353_Brus_Dis1.var_names.str.startswith('MT-')
D353_Brus_Dis1.obs['percent_mito'] = np.sum(
    D353_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D353_Brus_Dis1.X, axis=1).A1
D353_Brus_Dis1.obs['n_counts'] = D353_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D353_Brus_Dis1.to_df())
ribo_genes = D353_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D353_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D353_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D353_Brus_Dis1.X, axis=1).A1
D353_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D353_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [257]:
sc.pp.filter_cells(D353_Brus_Dis1, min_genes=500)
D353_Brus_Dis1 = D353_Brus_Dis1[D353_Brus_Dis1.obs['n_counts'] < 20000, :]
D353_Brus_Dis1 = D353_Brus_Dis1[D353_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
filtered out 200 cells that have less than 500 genes expressed
In [258]:
# scrublet
scrub = scr.Scrublet(D353_Brus_Dis1.X, expected_doublet_rate=0.039)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D353_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D353_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.19
Detected doublet rate = 2.4%
Estimated detectable doublet fraction = 50.7%
Overall doublet rate:
	Expected   = 3.9%
	Estimated  = 4.8%
Elapsed time: 3.7 seconds
Out[258]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea9ce0e48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e8828c048>],
       dtype=object))
In [259]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D353_Brus_Dis1.X).predict()
D353_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0162525177001953 seconds
Jaccard graph constructed in 0.8561174869537354 seconds
Wrote graph to binary file in 0.3788738250732422 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906007
After 7 runs, maximum modularity is Q = 0.907108
Louvain completed 27 runs in 2.9793484210968018 seconds
PhenoGraph complete in 5.250179290771484 seconds
Found communities [-1, ... 20], with sizes: [202, 1070, 828, 405, 384, 362, 349, 340, 335, 285, 193, 191, 163, 116, 107, 98, 87, 86, 55, 34, 22, 19]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.025327205657959 seconds
Jaccard graph constructed in 0.9080049991607666 seconds
Wrote graph to binary file in 0.10102081298828125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907931
Louvain completed 21 runs in 2.151581048965454 seconds
PhenoGraph complete in 4.204631567001343 seconds
Found communities [-1, ... 26], with sizes: [167, 1100, 521, 426, 357, 350, 320, 312, 300, 227, 195, 191, 187, 140, 124, 115, 111, 103, 101, 79, 77, 68, 54, 31, 26, 21, 17, 11]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9135782718658447 seconds
Jaccard graph constructed in 0.881655216217041 seconds
Wrote graph to binary file in 0.10137200355529785 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903933
After 2 runs, maximum modularity is Q = 0.906896
After 11 runs, maximum modularity is Q = 0.908669
Louvain completed 31 runs in 3.568588972091675 seconds
PhenoGraph complete in 5.4850242137908936 seconds
Found communities [-1, ... 21], with sizes: [195, 1020, 678, 636, 503, 475, 375, 234, 211, 192, 178, 171, 140, 114, 111, 105, 103, 98, 73, 52, 32, 24, 11]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8154387474060059 seconds
Jaccard graph constructed in 1.1768968105316162 seconds
Wrote graph to binary file in 0.09696412086486816 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906034
Louvain completed 21 runs in 2.146071195602417 seconds
PhenoGraph complete in 4.2581493854522705 seconds
Found communities [-1, ... 23], with sizes: [187, 1133, 735, 429, 366, 362, 348, 288, 285, 216, 170, 168, 144, 131, 122, 97, 95, 93, 86, 83, 64, 62, 32, 18, 17]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8135569095611572 seconds
Jaccard graph constructed in 0.9032831192016602 seconds
Wrote graph to binary file in 0.370877742767334 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907711
Louvain completed 21 runs in 2.191746473312378 seconds
PhenoGraph complete in 4.298296928405762 seconds
Found communities [-1, ... 25], with sizes: [162, 954, 715, 632, 483, 370, 316, 247, 227, 212, 191, 189, 174, 118, 111, 109, 97, 97, 82, 62, 58, 36, 24, 23, 18, 12, 12]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7135679721832275 seconds
Jaccard graph constructed in 0.8894424438476562 seconds
Wrote graph to binary file in 0.09662818908691406 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909855
Louvain completed 21 runs in 2.1778337955474854 seconds
PhenoGraph complete in 3.897204875946045 seconds
Found communities [-1, ... 22], with sizes: [181, 1022, 729, 463, 443, 440, 409, 306, 288, 201, 176, 127, 121, 111, 111, 97, 96, 88, 81, 75, 70, 55, 22, 19]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9132382869720459 seconds
Jaccard graph constructed in 0.8571217060089111 seconds
Wrote graph to binary file in 0.09652829170227051 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910012
After 3 runs, maximum modularity is Q = 0.911375
Louvain completed 23 runs in 2.597601890563965 seconds
PhenoGraph complete in 4.483564138412476 seconds
Found communities [-1, ... 23], with sizes: [211, 1158, 546, 513, 499, 413, 290, 273, 256, 228, 179, 167, 126, 117, 115, 100, 98, 92, 88, 84, 59, 53, 23, 22, 21]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8131711483001709 seconds
Jaccard graph constructed in 1.1122410297393799 seconds
Wrote graph to binary file in 0.09736990928649902 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906841
After 3 runs, maximum modularity is Q = 0.908321
Louvain completed 23 runs in 2.661264657974243 seconds
PhenoGraph complete in 4.7019267082214355 seconds
Found communities [-1, ... 21], with sizes: [168, 1025, 639, 588, 451, 329, 298, 276, 264, 262, 227, 199, 152, 131, 117, 102, 102, 99, 84, 80, 62, 55, 21]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9124712944030762 seconds
Jaccard graph constructed in 0.8465819358825684 seconds
Wrote graph to binary file in 0.33472323417663574 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907178
After 3 runs, maximum modularity is Q = 0.908195
Louvain completed 23 runs in 2.7131783962249756 seconds
PhenoGraph complete in 4.826534748077393 seconds
Found communities [-1, ... 24], with sizes: [198, 1076, 780, 721, 405, 334, 241, 225, 220, 206, 188, 126, 125, 125, 106, 98, 86, 83, 82, 78, 61, 53, 40, 35, 21, 18]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8127193450927734 seconds
Jaccard graph constructed in 0.8970918655395508 seconds
Wrote graph to binary file in 0.09639120101928711 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907681
After 2 runs, maximum modularity is Q = 0.908963
Louvain completed 22 runs in 2.432924270629883 seconds
PhenoGraph complete in 4.258150815963745 seconds
Found communities [-1, ... 25], with sizes: [187, 1105, 591, 578, 430, 400, 324, 289, 219, 218, 190, 177, 159, 133, 125, 108, 96, 81, 77, 63, 55, 29, 26, 21, 20, 16, 14]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0133864879608154 seconds
Jaccard graph constructed in 0.882857084274292 seconds
Wrote graph to binary file in 0.09488463401794434 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906973
After 4 runs, maximum modularity is Q = 0.908767
Louvain completed 24 runs in 2.723487615585327 seconds
PhenoGraph complete in 4.733052968978882 seconds
Found communities [-1, ... 22], with sizes: [187, 1055, 543, 499, 454, 391, 345, 329, 271, 220, 194, 162, 157, 127, 120, 101, 97, 95, 93, 88, 77, 59, 45, 22]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8132424354553223 seconds
Jaccard graph constructed in 0.9040405750274658 seconds
Wrote graph to binary file in 0.3261268138885498 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906486
After 5 runs, maximum modularity is Q = 0.907882
Louvain completed 25 runs in 2.90863299369812 seconds
PhenoGraph complete in 4.972553730010986 seconds
Found communities [-1, ... 21], with sizes: [199, 1338, 666, 618, 542, 418, 257, 234, 210, 200, 145, 127, 114, 112, 94, 90, 88, 82, 76, 52, 32, 26, 11]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7141628265380859 seconds
Jaccard graph constructed in 0.8912928104400635 seconds
Wrote graph to binary file in 0.09833002090454102 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909339
Louvain completed 21 runs in 2.091317892074585 seconds
PhenoGraph complete in 3.8150699138641357 seconds
Found communities [-1, ... 22], with sizes: [169, 1132, 590, 589, 457, 379, 340, 325, 218, 188, 148, 127, 126, 124, 116, 115, 115, 115, 99, 77, 73, 54, 32, 23]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.91365647315979 seconds
Jaccard graph constructed in 1.1987249851226807 seconds
Wrote graph to binary file in 0.09418463706970215 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906286
After 2 runs, maximum modularity is Q = 0.908241
Louvain completed 22 runs in 2.516464948654175 seconds
PhenoGraph complete in 4.742603778839111 seconds
Found communities [-1, ... 23], with sizes: [166, 1060, 662, 517, 381, 358, 326, 292, 291, 211, 197, 192, 164, 146, 135, 104, 102, 100, 79, 73, 61, 54, 24, 23, 13]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7124850749969482 seconds
Jaccard graph constructed in 0.8991072177886963 seconds
Wrote graph to binary file in 0.3332068920135498 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909318
Louvain completed 21 runs in 2.1733670234680176 seconds
PhenoGraph complete in 4.137807369232178 seconds
Found communities [-1, ... 24], with sizes: [167, 1343, 761, 417, 387, 382, 252, 251, 236, 190, 169, 135, 126, 123, 114, 112, 93, 86, 81, 80, 61, 55, 38, 36, 19, 17]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8131327629089355 seconds
Jaccard graph constructed in 0.8925216197967529 seconds
Wrote graph to binary file in 0.09775638580322266 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909605
After 5 runs, maximum modularity is Q = 0.910888
Louvain completed 25 runs in 2.690352201461792 seconds
PhenoGraph complete in 4.513716220855713 seconds
Found communities [-1, ... 25], with sizes: [191, 1167, 665, 431, 369, 359, 278, 272, 248, 222, 189, 180, 171, 119, 118, 114, 95, 88, 84, 83, 80, 73, 57, 23, 20, 19, 16]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.8124220371246338 seconds
Jaccard graph constructed in 0.8856863975524902 seconds
Wrote graph to binary file in 0.09579825401306152 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90632
After 4 runs, maximum modularity is Q = 0.907518
Louvain completed 24 runs in 2.6654577255249023 seconds
PhenoGraph complete in 4.477853536605835 seconds
Found communities [-1, ... 22], with sizes: [193, 1006, 831, 446, 421, 395, 370, 322, 267, 226, 189, 170, 123, 115, 106, 99, 94, 93, 70, 64, 58, 33, 20, 20]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.817241907119751 seconds
Jaccard graph constructed in 1.1534202098846436 seconds
Wrote graph to binary file in 0.09583544731140137 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905742
After 3 runs, maximum modularity is Q = 0.906999
Louvain completed 23 runs in 2.583540678024292 seconds
PhenoGraph complete in 4.670905590057373 seconds
Found communities [-1, ... 25], with sizes: [185, 969, 537, 475, 441, 389, 334, 330, 219, 203, 202, 167, 159, 148, 140, 126, 102, 96, 94, 89, 80, 67, 59, 41, 37, 23, 19]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9173688888549805 seconds
Jaccard graph constructed in 0.857006311416626 seconds
Wrote graph to binary file in 0.09522342681884766 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907883
Louvain completed 21 runs in 2.2362606525421143 seconds
PhenoGraph complete in 4.1246726512908936 seconds
Found communities [-1, ... 20], with sizes: [214, 1014, 518, 517, 505, 503, 473, 316, 273, 220, 184, 156, 142, 112, 112, 107, 100, 92, 79, 54, 21, 19]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.714038610458374 seconds
Jaccard graph constructed in 1.1092274188995361 seconds
Wrote graph to binary file in 0.09666991233825684 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904754
After 4 runs, maximum modularity is Q = 0.906495
Louvain completed 24 runs in 2.6600112915039062 seconds
PhenoGraph complete in 4.597906589508057 seconds
Found communities [-1, ... 23], with sizes: [172, 1016, 666, 585, 525, 524, 439, 239, 227, 194, 127, 125, 124, 113, 101, 91, 79, 75, 73, 71, 54, 46, 31, 23, 11]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 1.0234904289245605 seconds
Jaccard graph constructed in 0.9439792633056641 seconds
Wrote graph to binary file in 0.10279488563537598 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908928
After 2 runs, maximum modularity is Q = 0.910043
Louvain completed 22 runs in 2.502131462097168 seconds
PhenoGraph complete in 4.601895093917847 seconds
Found communities [-1, ... 23], with sizes: [198, 1197, 566, 562, 384, 316, 312, 283, 280, 266, 195, 189, 147, 143, 116, 99, 95, 90, 81, 67, 54, 28, 22, 21, 20]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.818598747253418 seconds
Jaccard graph constructed in 1.184175968170166 seconds
Wrote graph to binary file in 0.0951235294342041 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905792
After 6 runs, maximum modularity is Q = 0.906813
Louvain completed 26 runs in 2.7821874618530273 seconds
PhenoGraph complete in 4.901823282241821 seconds
Found communities [-1, ... 24], with sizes: [165, 1066, 756, 553, 414, 341, 296, 294, 269, 172, 163, 160, 145, 133, 124, 99, 97, 92, 79, 77, 63, 54, 46, 40, 22, 11]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7158310413360596 seconds
Jaccard graph constructed in 0.9046008586883545 seconds
Wrote graph to binary file in 0.3708937168121338 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908037
Louvain completed 21 runs in 2.161968469619751 seconds
PhenoGraph complete in 4.17364501953125 seconds
Found communities [-1, ... 24], with sizes: [192, 1196, 658, 485, 350, 329, 296, 267, 257, 250, 195, 189, 171, 120, 114, 108, 104, 94, 83, 82, 68, 54, 19, 18, 18, 14]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.7137269973754883 seconds
Jaccard graph constructed in 0.8944852352142334 seconds
Wrote graph to binary file in 0.0983583927154541 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90756
After 4 runs, maximum modularity is Q = 0.909133
Louvain completed 24 runs in 2.8156487941741943 seconds
PhenoGraph complete in 4.542718410491943 seconds
Found communities [-1, ... 22], with sizes: [176, 1148, 812, 650, 334, 296, 281, 269, 260, 232, 199, 197, 136, 99, 93, 87, 86, 86, 81, 78, 54, 31, 28, 18]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.9139010906219482 seconds
Jaccard graph constructed in 0.8702192306518555 seconds
Wrote graph to binary file in 0.09519720077514648 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909525
Louvain completed 21 runs in 2.196678638458252 seconds
PhenoGraph complete in 4.094508171081543 seconds
Found communities [-1, ... 22], with sizes: [215, 1003, 797, 561, 436, 422, 369, 260, 242, 170, 169, 144, 131, 112, 107, 105, 101, 90, 81, 81, 56, 31, 28, 20]

In [260]:
sc.pp.normalize_per_cell(D353_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D353_Brus_Dis1) # log transform the data
D353_Brus_Dis1.raw = D353_Brus_Dis1 # freeze the object (for later use of the raw state of it)
In [261]:
D353_Brus_Dis1 = D353_Brus_Dis1[:, D353_Brus_Dis1.var['ribo_genes']]
D353_Brus_Dis1
Out[261]:
View of AnnData object with n_obs × n_vars = 4585 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [262]:
D354_Brus_Dis1 = sc.read_10x_mtx(
    './D354_Brus_Dis1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D354_Brus_Dis1.var_names_make_unique()
D354_Brus_Dis1.obs['manip'] = 'D354_Brus_Dis1'
D354_Brus_Dis1.obs['position'] = 'Distal'
D354_Brus_Dis1.obs['method'] = 'Brushing'
D354_Brus_Dis1.obs['donor'] = 'D354'
D354_Brus_Dis1.obs['name'] = ['D354_Brus_Dis1_' + s for s in list(D354_Brus_Dis1.obs.index)]
D354_Brus_Dis1.obs_names = D354_Brus_Dis1.obs['name']
D354_Brus_Dis1
... reading from cache file ./cache/D354_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[262]:
AnnData object with n_obs × n_vars = 2674 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [263]:
sc.pl.highest_expr_genes(D354_Brus_Dis1, n_top=20)
In [264]:
sc.pp.filter_cells(D354_Brus_Dis1, min_genes=0)
mito_genes = D354_Brus_Dis1.var_names.str.startswith('MT-')
D354_Brus_Dis1.obs['percent_mito'] = np.sum(
    D354_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D354_Brus_Dis1.X, axis=1).A1
D354_Brus_Dis1.obs['n_counts'] = D354_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D354_Brus_Dis1.to_df())
ribo_genes = D354_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D354_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D354_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D354_Brus_Dis1.X, axis=1).A1
D354_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D354_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [265]:
sc.pp.filter_cells(D354_Brus_Dis1, min_genes=500)
D354_Brus_Dis1 = D354_Brus_Dis1[D354_Brus_Dis1.obs['n_counts'] < 30000, :]
D354_Brus_Dis1 = D354_Brus_Dis1[D354_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
filtered out 107 cells that have less than 500 genes expressed
In [266]:
# scrublet
scrub = scr.Scrublet(D354_Brus_Dis1.X, expected_doublet_rate=0.02)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D354_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D354_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.14
Detected doublet rate = 1.1%
Estimated detectable doublet fraction = 48.4%
Overall doublet rate:
	Expected   = 2.0%
	Estimated  = 2.3%
Elapsed time: 2.1 seconds
Out[266]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9dffff28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9e887668>],
       dtype=object))
In [267]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D354_Brus_Dis1.X).predict()
D354_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3109779357910156 seconds
Jaccard graph constructed in 0.6154134273529053 seconds
Wrote graph to binary file in 0.05150198936462402 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912896
Louvain completed 21 runs in 1.676947832107544 seconds
PhenoGraph complete in 2.6751487255096436 seconds
Found communities [-1, ... 19], with sizes: [276, 574, 336, 273, 258, 253, 249, 218, 146, 117, 94, 74, 65, 59, 46, 33, 32, 28, 27, 23, 12]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31023359298706055 seconds
Jaccard graph constructed in 0.6403079032897949 seconds
Wrote graph to binary file in 0.055225372314453125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912606
Louvain completed 21 runs in 1.6850180625915527 seconds
PhenoGraph complete in 2.7085413932800293 seconds
Found communities [-1, ... 20], with sizes: [247, 464, 368, 268, 250, 246, 244, 217, 158, 133, 129, 82, 82, 79, 69, 40, 28, 26, 18, 16, 15, 14]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41544604301452637 seconds
Jaccard graph constructed in 0.6332492828369141 seconds
Wrote graph to binary file in 0.3448221683502197 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911734
After 12 runs, maximum modularity is Q = 0.91298
Louvain completed 32 runs in 2.6464929580688477 seconds
PhenoGraph complete in 4.052936553955078 seconds
Found communities [-1, ... 23], with sizes: [227, 526, 382, 317, 279, 222, 211, 184, 166, 112, 83, 78, 71, 63, 54, 53, 37, 31, 26, 14, 12, 12, 11, 11, 11]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4133937358856201 seconds
Jaccard graph constructed in 0.6546018123626709 seconds
Wrote graph to binary file in 0.05997109413146973 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.91339
Louvain completed 21 runs in 1.6897292137145996 seconds
PhenoGraph complete in 2.8313348293304443 seconds
Found communities [-1, ... 21], with sizes: [263, 470, 376, 300, 249, 232, 223, 219, 146, 119, 82, 78, 68, 66, 64, 43, 43, 36, 29, 28, 28, 20, 11]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31443095207214355 seconds
Jaccard graph constructed in 0.6954479217529297 seconds
Wrote graph to binary file in 0.06105375289916992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911619
After 7 runs, maximum modularity is Q = 0.912628
Louvain completed 27 runs in 2.322207450866699 seconds
PhenoGraph complete in 3.4100241661071777 seconds
Found communities [-1, ... 21], with sizes: [273, 580, 357, 262, 255, 251, 215, 176, 156, 146, 84, 79, 74, 65, 51, 39, 30, 23, 21, 20, 13, 12, 11]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.308307409286499 seconds
Jaccard graph constructed in 0.6391499042510986 seconds
Wrote graph to binary file in 0.31018877029418945 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914066
Louvain completed 21 runs in 1.696565866470337 seconds
PhenoGraph complete in 2.983755350112915 seconds
Found communities [-1, ... 22], with sizes: [234, 571, 316, 269, 229, 223, 189, 182, 157, 142, 122, 85, 85, 73, 68, 59, 38, 35, 30, 29, 23, 12, 11, 11]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41469764709472656 seconds
Jaccard graph constructed in 0.6524677276611328 seconds
Wrote graph to binary file in 0.06051373481750488 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914361
Louvain completed 21 runs in 1.6785728931427002 seconds
PhenoGraph complete in 2.823359727859497 seconds
Found communities [-1, ... 22], with sizes: [246, 445, 337, 288, 277, 226, 224, 217, 173, 140, 117, 97, 95, 66, 56, 41, 28, 26, 26, 22, 13, 11, 11, 11]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31067657470703125 seconds
Jaccard graph constructed in 0.7079861164093018 seconds
Wrote graph to binary file in 0.060437917709350586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911151
After 3 runs, maximum modularity is Q = 0.912428
Louvain completed 23 runs in 2.0763797760009766 seconds
PhenoGraph complete in 3.1704320907592773 seconds
Found communities [-1, ... 21], with sizes: [244, 346, 346, 266, 263, 234, 231, 227, 219, 167, 136, 110, 87, 66, 55, 34, 31, 30, 28, 27, 21, 14, 11]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41648435592651367 seconds
Jaccard graph constructed in 0.6557509899139404 seconds
Wrote graph to binary file in 0.05987143516540527 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911838
After 2 runs, maximum modularity is Q = 0.913043
Louvain completed 22 runs in 2.020399332046509 seconds
PhenoGraph complete in 3.1707959175109863 seconds
Found communities [-1, ... 21], with sizes: [237, 463, 305, 300, 246, 219, 206, 203, 186, 171, 158, 92, 86, 69, 64, 39, 28, 27, 24, 23, 22, 13, 12]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3099398612976074 seconds
Jaccard graph constructed in 0.638746976852417 seconds
Wrote graph to binary file in 0.3167448043823242 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911895
After 9 runs, maximum modularity is Q = 0.913302
Louvain completed 29 runs in 2.5141894817352295 seconds
PhenoGraph complete in 3.793105363845825 seconds
Found communities [-1, ... 20], with sizes: [253, 365, 346, 317, 262, 245, 217, 206, 201, 130, 120, 109, 107, 65, 64, 49, 35, 28, 27, 24, 12, 11]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31298279762268066 seconds
Jaccard graph constructed in 0.6400563716888428 seconds
Wrote graph to binary file in 0.06019878387451172 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.915879
Louvain completed 21 runs in 1.6584103107452393 seconds
PhenoGraph complete in 2.6892991065979004 seconds
Found communities [-1, ... 21], with sizes: [249, 435, 412, 312, 241, 216, 210, 210, 137, 117, 111, 99, 90, 82, 64, 46, 45, 30, 26, 22, 14, 13, 12]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41120338439941406 seconds
Jaccard graph constructed in 0.6665787696838379 seconds
Wrote graph to binary file in 0.05998063087463379 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913767
Louvain completed 21 runs in 1.6954376697540283 seconds
PhenoGraph complete in 2.8483452796936035 seconds
Found communities [-1, ... 22], with sizes: [248, 571, 329, 302, 267, 251, 220, 177, 152, 114, 98, 86, 67, 66, 52, 36, 28, 27, 22, 21, 16, 15, 15, 13]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3105454444885254 seconds
Jaccard graph constructed in 0.6610550880432129 seconds
Wrote graph to binary file in 0.0586395263671875 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910875
After 2 runs, maximum modularity is Q = 0.912356
After 11 runs, maximum modularity is Q = 0.913604
Louvain completed 31 runs in 2.885418176651001 seconds
PhenoGraph complete in 3.932236909866333 seconds
Found communities [-1, ... 21], with sizes: [258, 454, 359, 271, 265, 238, 212, 200, 160, 150, 142, 100, 78, 64, 64, 37, 28, 27, 23, 20, 18, 13, 12]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41141247749328613 seconds
Jaccard graph constructed in 0.7076559066772461 seconds
Wrote graph to binary file in 0.35921216011047363 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911854
Louvain completed 21 runs in 1.6987216472625732 seconds
PhenoGraph complete in 3.1918833255767822 seconds
Found communities [-1, ... 20], with sizes: [197, 594, 309, 262, 251, 216, 209, 192, 166, 159, 143, 98, 84, 82, 62, 39, 29, 29, 25, 22, 14, 11]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31340742111206055 seconds
Jaccard graph constructed in 0.6480762958526611 seconds
Wrote graph to binary file in 0.05925345420837402 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910218
After 3 runs, maximum modularity is Q = 0.911512
Louvain completed 23 runs in 2.070939064025879 seconds
PhenoGraph complete in 3.105262279510498 seconds
Found communities [-1, ... 20], with sizes: [262, 493, 384, 375, 241, 221, 211, 203, 156, 133, 89, 78, 54, 48, 47, 40, 39, 28, 27, 26, 25, 13]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.318835973739624 seconds
Jaccard graph constructed in 0.6472687721252441 seconds
Wrote graph to binary file in 0.06056022644042969 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914123
Louvain completed 21 runs in 1.7077522277832031 seconds
PhenoGraph complete in 2.7500057220458984 seconds
Found communities [-1, ... 19], with sizes: [257, 502, 328, 313, 261, 226, 219, 214, 158, 131, 114, 90, 76, 71, 64, 43, 31, 29, 27, 23, 16]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.310788631439209 seconds
Jaccard graph constructed in 0.6427536010742188 seconds
Wrote graph to binary file in 0.05820202827453613 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911745
After 2 runs, maximum modularity is Q = 0.912848
Louvain completed 22 runs in 2.0115232467651367 seconds
PhenoGraph complete in 3.0371336936950684 seconds
Found communities [-1, ... 19], with sizes: [240, 574, 327, 271, 268, 254, 242, 218, 159, 154, 104, 90, 54, 49, 39, 37, 28, 28, 28, 16, 13]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30944156646728516 seconds
Jaccard graph constructed in 0.6575002670288086 seconds
Wrote graph to binary file in 0.3323483467102051 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910348
After 13 runs, maximum modularity is Q = 0.91146
Louvain completed 33 runs in 2.7110776901245117 seconds
PhenoGraph complete in 4.027470111846924 seconds
Found communities [-1, ... 18], with sizes: [253, 592, 283, 268, 245, 238, 213, 193, 161, 147, 145, 114, 77, 67, 67, 36, 28, 25, 22, 19]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3081531524658203 seconds
Jaccard graph constructed in 0.6655983924865723 seconds
Wrote graph to binary file in 0.05713820457458496 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911014
After 6 runs, maximum modularity is Q = 0.912216
Louvain completed 26 runs in 2.2825162410736084 seconds
PhenoGraph complete in 3.334857702255249 seconds
Found communities [-1, ... 21], with sizes: [231, 465, 308, 279, 220, 219, 211, 210, 146, 121, 120, 103, 98, 89, 79, 77, 65, 38, 29, 27, 26, 21, 11]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41150379180908203 seconds
Jaccard graph constructed in 0.6342306137084961 seconds
Wrote graph to binary file in 0.0579066276550293 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914207
Louvain completed 21 runs in 1.6732633113861084 seconds
PhenoGraph complete in 2.79417085647583 seconds
Found communities [-1, ... 21], with sizes: [246, 567, 318, 277, 276, 246, 241, 215, 150, 119, 93, 74, 71, 59, 47, 40, 31, 29, 23, 22, 21, 14, 14]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3146936893463135 seconds
Jaccard graph constructed in 0.6446681022644043 seconds
Wrote graph to binary file in 0.30008506774902344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913863
After 8 runs, maximum modularity is Q = 0.914904
Louvain completed 28 runs in 2.406613349914551 seconds
PhenoGraph complete in 3.6837995052337646 seconds
Found communities [-1, ... 22], with sizes: [241, 461, 329, 325, 307, 247, 223, 170, 159, 124, 88, 82, 71, 65, 64, 50, 35, 29, 27, 26, 26, 20, 13, 11]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.4151759147644043 seconds
Jaccard graph constructed in 0.6969523429870605 seconds
Wrote graph to binary file in 0.05820322036743164 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912801
After 5 runs, maximum modularity is Q = 0.913839
Louvain completed 25 runs in 2.2126829624176025 seconds
PhenoGraph complete in 3.3955702781677246 seconds
Found communities [-1, ... 19], with sizes: [270, 537, 301, 278, 242, 227, 219, 209, 191, 164, 140, 82, 77, 51, 39, 34, 33, 29, 29, 28, 13]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3191845417022705 seconds
Jaccard graph constructed in 0.6557085514068604 seconds
Wrote graph to binary file in 0.058357954025268555 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912105
Louvain completed 21 runs in 1.6967473030090332 seconds
PhenoGraph complete in 2.7497496604919434 seconds
Found communities [-1, ... 19], with sizes: [200, 432, 336, 331, 289, 238, 221, 212, 169, 154, 152, 96, 72, 71, 44, 44, 37, 30, 29, 24, 12]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.41039347648620605 seconds
Jaccard graph constructed in 0.6711676120758057 seconds
Wrote graph to binary file in 0.058712005615234375 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910091
After 4 runs, maximum modularity is Q = 0.911357
Louvain completed 24 runs in 2.1444499492645264 seconds
PhenoGraph complete in 3.300370454788208 seconds
Found communities [-1, ... 21], with sizes: [247, 446, 298, 247, 236, 231, 216, 189, 174, 126, 119, 110, 100, 93, 74, 66, 59, 39, 33, 29, 27, 23, 11]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3092787265777588 seconds
Jaccard graph constructed in 0.6285579204559326 seconds
Wrote graph to binary file in 0.33060455322265625 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911165
After 2 runs, maximum modularity is Q = 0.912317
Louvain completed 22 runs in 2.004624605178833 seconds
PhenoGraph complete in 3.286855459213257 seconds
Found communities [-1, ... 19], with sizes: [277, 458, 349, 311, 286, 265, 226, 202, 172, 143, 103, 87, 72, 49, 43, 37, 27, 26, 26, 21, 13]

In [268]:
sc.pp.normalize_per_cell(D354_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D354_Brus_Dis1) # log transform the data
D354_Brus_Dis1.raw = D354_Brus_Dis1 # freeze the object (for later use of the raw state of it)
In [269]:
D354_Brus_Dis1 = D354_Brus_Dis1[:, D354_Brus_Dis1.var['ribo_genes']]
D354_Brus_Dis1
Out[269]:
View of AnnData object with n_obs × n_vars = 2555 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [270]:
D363_Brus_Dis1 = sc.read_10x_mtx(
    './D363_Brus_Dis1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D363_Brus_Dis1.var_names_make_unique()
D363_Brus_Dis1.obs['manip'] = 'D363_Brus_Dis1'
D363_Brus_Dis1.obs['position'] = 'Distal'
D363_Brus_Dis1.obs['method'] = 'Brushing'
D363_Brus_Dis1.obs['donor'] = 'D363'
D363_Brus_Dis1.obs['name'] = ['D363_Brus_Dis1_' + s for s in list(D363_Brus_Dis1.obs.index)]
D363_Brus_Dis1.obs_names = D363_Brus_Dis1.obs['name']
D363_Brus_Dis1
... reading from cache file ./cache/D363_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[270]:
AnnData object with n_obs × n_vars = 1636 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [271]:
sc.pl.highest_expr_genes(D363_Brus_Dis1, n_top=20)
In [272]:
sc.pp.filter_cells(D363_Brus_Dis1, min_genes=0)
mito_genes = D363_Brus_Dis1.var_names.str.startswith('MT-')
D363_Brus_Dis1.obs['percent_mito'] = np.sum(
    D363_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D363_Brus_Dis1.X, axis=1).A1
D363_Brus_Dis1.obs['n_counts'] = D363_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D363_Brus_Dis1.to_df())
ribo_genes = D363_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D363_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D363_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D363_Brus_Dis1.X, axis=1).A1
D363_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D363_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [273]:
sc.pp.filter_cells(D363_Brus_Dis1, min_genes=500)
D363_Brus_Dis1 = D363_Brus_Dis1[D363_Brus_Dis1.obs['n_counts'] < 40000, :]
D363_Brus_Dis1 = D363_Brus_Dis1[D363_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
filtered out 7 cells that have less than 500 genes expressed
In [274]:
# scrublet
scrub = scr.Scrublet(D363_Brus_Dis1.X, expected_doublet_rate=0.014)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D363_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D363_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.15
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 12.2%
Overall doublet rate:
	Expected   = 1.4%
	Estimated  = 2.5%
Elapsed time: 1.2 seconds
Out[274]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9f31f7f0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9e769b00>],
       dtype=object))
In [275]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D363_Brus_Dis1.X).predict()
D363_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3056020736694336 seconds
Jaccard graph constructed in 0.5146067142486572 seconds
Wrote graph to binary file in 0.03743934631347656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888374
Louvain completed 21 runs in 1.5843629837036133 seconds
PhenoGraph complete in 2.453190565109253 seconds
Found communities [-1, ... 16], with sizes: [114, 448, 325, 285, 195, 116, 69, 65, 56, 56, 56, 47, 45, 42, 38, 34, 25, 15]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3059718608856201 seconds
Jaccard graph constructed in 0.5110785961151123 seconds
Wrote graph to binary file in 0.03577017784118652 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.88442
After 2 runs, maximum modularity is Q = 0.887818
Louvain completed 22 runs in 1.8797099590301514 seconds
PhenoGraph complete in 2.7443838119506836 seconds
Found communities [-1, ... 15], with sizes: [116, 427, 314, 293, 250, 121, 73, 62, 54, 53, 47, 45, 43, 38, 37, 33, 25]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30555295944213867 seconds
Jaccard graph constructed in 0.5366594791412354 seconds
Wrote graph to binary file in 0.03516554832458496 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.880115
Louvain completed 21 runs in 1.5799152851104736 seconds
PhenoGraph complete in 2.471057176589966 seconds
Found communities [-1, ... 16], with sizes: [111, 390, 354, 345, 107, 105, 104, 73, 61, 59, 53, 50, 47, 44, 38, 33, 29, 28]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30599188804626465 seconds
Jaccard graph constructed in 0.5148305892944336 seconds
Wrote graph to binary file in 0.03429746627807617 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.88795
Louvain completed 21 runs in 1.5485806465148926 seconds
PhenoGraph complete in 2.4131393432617188 seconds
Found communities [-1, ... 16], with sizes: [100, 445, 328, 277, 201, 105, 72, 64, 64, 58, 49, 49, 46, 43, 40, 33, 30, 27]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30594825744628906 seconds
Jaccard graph constructed in 0.505589485168457 seconds
Wrote graph to binary file in 0.29527902603149414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888183
Louvain completed 21 runs in 1.5424954891204834 seconds
PhenoGraph complete in 2.659623384475708 seconds
Found communities [-1, ... 16], with sizes: [90, 472, 330, 247, 186, 123, 77, 62, 60, 59, 58, 54, 53, 46, 36, 34, 25, 19]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20841121673583984 seconds
Jaccard graph constructed in 0.5117864608764648 seconds
Wrote graph to binary file in 0.08066749572753906 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886045
After 3 runs, maximum modularity is Q = 0.887464
Louvain completed 23 runs in 1.927800178527832 seconds
PhenoGraph complete in 2.7448880672454834 seconds
Found communities [-1, ... 17], with sizes: [92, 370, 297, 281, 184, 106, 98, 90, 66, 61, 61, 60, 56, 51, 44, 39, 34, 27, 14]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.307281494140625 seconds
Jaccard graph constructed in 0.5165157318115234 seconds
Wrote graph to binary file in 0.05749392509460449 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.891585
After 13 runs, maximum modularity is Q = 0.892615
Louvain completed 33 runs in 2.569979667663574 seconds
PhenoGraph complete in 3.4644887447357178 seconds
Found communities [-1, ... 16], with sizes: [106, 414, 360, 308, 165, 106, 68, 62, 60, 60, 59, 53, 44, 41, 34, 33, 32, 26]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2069225311279297 seconds
Jaccard graph constructed in 0.5162346363067627 seconds
Wrote graph to binary file in 0.09787106513977051 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888624
Louvain completed 21 runs in 1.5862059593200684 seconds
PhenoGraph complete in 2.4214892387390137 seconds
Found communities [-1, ... 14], with sizes: [131, 398, 328, 328, 166, 122, 104, 88, 69, 67, 48, 46, 43, 34, 33, 26]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30587148666381836 seconds
Jaccard graph constructed in 0.5218026638031006 seconds
Wrote graph to binary file in 0.06596827507019043 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888201
After 2 runs, maximum modularity is Q = 0.890184
Louvain completed 22 runs in 1.8673889636993408 seconds
PhenoGraph complete in 2.777205228805542 seconds
Found communities [-1, ... 16], with sizes: [112, 391, 377, 281, 197, 108, 78, 67, 63, 51, 50, 48, 47, 42, 34, 30, 30, 25]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20612597465515137 seconds
Jaccard graph constructed in 0.5316848754882812 seconds
Wrote graph to binary file in 0.056473493576049805 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.884153
After 3 runs, maximum modularity is Q = 0.885227
Louvain completed 23 runs in 1.9474444389343262 seconds
PhenoGraph complete in 2.774909496307373 seconds
Found communities [-1, ... 17], with sizes: [105, 385, 316, 226, 198, 115, 109, 82, 62, 60, 58, 56, 55, 47, 44, 34, 32, 28, 19]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31055498123168945 seconds
Jaccard graph constructed in 0.5277385711669922 seconds
Wrote graph to binary file in 0.3530879020690918 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888808
Louvain completed 21 runs in 1.559537649154663 seconds
PhenoGraph complete in 2.7639319896698 seconds
Found communities [-1, ... 17], with sizes: [95, 403, 337, 285, 219, 116, 97, 64, 64, 63, 49, 47, 45, 37, 35, 27, 19, 15, 14]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3062112331390381 seconds
Jaccard graph constructed in 0.500612735748291 seconds
Wrote graph to binary file in 0.0522458553314209 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.887552
After 3 runs, maximum modularity is Q = 0.889903
Louvain completed 23 runs in 1.9731624126434326 seconds
PhenoGraph complete in 2.8495771884918213 seconds
Found communities [-1, ... 16], with sizes: [118, 368, 341, 293, 193, 124, 107, 75, 72, 59, 49, 47, 41, 37, 36, 34, 26, 11]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30605316162109375 seconds
Jaccard graph constructed in 0.5170273780822754 seconds
Wrote graph to binary file in 0.06779837608337402 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.88737
After 3 runs, maximum modularity is Q = 0.889903
Louvain completed 23 runs in 1.9486722946166992 seconds
PhenoGraph complete in 2.8560667037963867 seconds
Found communities [-1, ... 16], with sizes: [106, 397, 308, 299, 221, 123, 71, 67, 67, 62, 50, 49, 48, 44, 40, 34, 26, 19]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20672941207885742 seconds
Jaccard graph constructed in 0.5143043994903564 seconds
Wrote graph to binary file in 0.07566356658935547 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886199
After 15 runs, maximum modularity is Q = 0.887373
Louvain completed 35 runs in 2.6414635181427 seconds
PhenoGraph complete in 3.4506618976593018 seconds
Found communities [-1, ... 16], with sizes: [116, 469, 308, 294, 153, 79, 72, 68, 68, 61, 53, 49, 44, 44, 44, 44, 35, 30]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20579004287719727 seconds
Jaccard graph constructed in 0.5308966636657715 seconds
Wrote graph to binary file in 0.0879676342010498 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.895374
Louvain completed 21 runs in 1.5688138008117676 seconds
PhenoGraph complete in 2.4071056842803955 seconds
Found communities [-1, ... 16], with sizes: [116, 373, 358, 313, 188, 104, 72, 72, 59, 58, 55, 54, 45, 42, 36, 34, 30, 22]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30623459815979004 seconds
Jaccard graph constructed in 0.5156998634338379 seconds
Wrote graph to binary file in 0.04895901679992676 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.884455
Louvain completed 21 runs in 1.5934596061706543 seconds
PhenoGraph complete in 2.477506399154663 seconds
Found communities [-1, ... 16], with sizes: [104, 356, 337, 277, 198, 137, 108, 102, 66, 65, 45, 45, 45, 45, 35, 32, 23, 11]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20683526992797852 seconds
Jaccard graph constructed in 0.5088133811950684 seconds
Wrote graph to binary file in 0.3363635540008545 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.884463
After 2 runs, maximum modularity is Q = 0.888948
Louvain completed 22 runs in 1.9026503562927246 seconds
PhenoGraph complete in 2.968736171722412 seconds
Found communities [-1, ... 16], with sizes: [120, 396, 342, 299, 195, 105, 73, 66, 60, 56, 54, 46, 45, 42, 38, 36, 30, 28]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3061048984527588 seconds
Jaccard graph constructed in 0.5150551795959473 seconds
Wrote graph to binary file in 0.07030296325683594 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.890766
After 6 runs, maximum modularity is Q = 0.891794
Louvain completed 26 runs in 2.142488956451416 seconds
PhenoGraph complete in 3.0472187995910645 seconds
Found communities [-1, ... 16], with sizes: [113, 444, 339, 242, 212, 109, 80, 70, 66, 56, 51, 47, 43, 39, 37, 35, 26, 22]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3075876235961914 seconds
Jaccard graph constructed in 0.5247697830200195 seconds
Wrote graph to binary file in 0.07669305801391602 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.886795
After 2 runs, maximum modularity is Q = 0.89007
Louvain completed 22 runs in 1.8691158294677734 seconds
PhenoGraph complete in 2.7894909381866455 seconds
Found communities [-1, ... 18], with sizes: [103, 310, 301, 202, 192, 124, 117, 105, 80, 69, 67, 59, 56, 45, 43, 42, 38, 36, 25, 17]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.31122303009033203 seconds
Jaccard graph constructed in 0.5190975666046143 seconds
Wrote graph to binary file in 0.09315967559814453 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885109
After 2 runs, maximum modularity is Q = 0.886114
Louvain completed 22 runs in 1.8706414699554443 seconds
PhenoGraph complete in 2.816376209259033 seconds
Found communities [-1, ... 16], with sizes: [98, 419, 365, 292, 164, 101, 70, 68, 65, 62, 61, 44, 43, 41, 39, 37, 32, 30]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3073849678039551 seconds
Jaccard graph constructed in 0.5167427062988281 seconds
Wrote graph to binary file in 0.0665273666381836 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.893247
After 17 runs, maximum modularity is Q = 0.894652
Louvain completed 37 runs in 2.7665717601776123 seconds
PhenoGraph complete in 3.6692793369293213 seconds
Found communities [-1, ... 17], with sizes: [106, 375, 373, 292, 169, 103, 79, 74, 66, 63, 52, 50, 45, 44, 35, 35, 29, 28, 13]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30718994140625 seconds
Jaccard graph constructed in 0.5227699279785156 seconds
Wrote graph to binary file in 0.06388568878173828 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.887266
Louvain completed 21 runs in 1.5775952339172363 seconds
PhenoGraph complete in 2.4874119758605957 seconds
Found communities [-1, ... 15], with sizes: [97, 356, 351, 332, 187, 123, 109, 80, 69, 69, 49, 45, 44, 34, 30, 29, 27]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20546865463256836 seconds
Jaccard graph constructed in 0.5173823833465576 seconds
Wrote graph to binary file in 0.36354827880859375 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.885597
After 2 runs, maximum modularity is Q = 0.888471
Louvain completed 22 runs in 1.8766109943389893 seconds
PhenoGraph complete in 2.982496738433838 seconds
Found communities [-1, ... 16], with sizes: [107, 415, 295, 289, 192, 124, 114, 64, 63, 59, 51, 48, 45, 43, 36, 30, 30, 26]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20569634437561035 seconds
Jaccard graph constructed in 0.5270733833312988 seconds
Wrote graph to binary file in 0.05930829048156738 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.887685
Louvain completed 21 runs in 1.551440715789795 seconds
PhenoGraph complete in 2.3558590412139893 seconds
Found communities [-1, ... 15], with sizes: [120, 405, 292, 197, 187, 149, 122, 99, 78, 63, 59, 58, 53, 45, 42, 36, 26]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3061830997467041 seconds
Jaccard graph constructed in 0.5073649883270264 seconds
Wrote graph to binary file in 0.09500670433044434 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.888464
Louvain completed 21 runs in 1.556274652481079 seconds
PhenoGraph complete in 2.4863333702087402 seconds
Found communities [-1, ... 16], with sizes: [90, 393, 337, 297, 192, 114, 88, 70, 65, 57, 55, 53, 52, 46, 45, 35, 26, 16]

In [276]:
sc.pp.normalize_per_cell(D363_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D363_Brus_Dis1) # log transform the data
D363_Brus_Dis1.raw = D363_Brus_Dis1 # freeze the object (for later use of the raw state of it)
In [277]:
D363_Brus_Dis1 = D363_Brus_Dis1[:, D363_Brus_Dis1.var['ribo_genes']]
D363_Brus_Dis1
Out[277]:
View of AnnData object with n_obs × n_vars = 1625 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [278]:
D367_Brus_Dis1 = sc.read_10x_mtx(
    './D367_Brus_Dis1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D367_Brus_Dis1.var_names_make_unique()
D367_Brus_Dis1.obs['manip'] = 'D367_Brus_Dis1'
D367_Brus_Dis1.obs['position'] = 'Distal'
D367_Brus_Dis1.obs['method'] = 'Brushing'
D367_Brus_Dis1.obs['donor'] = 'D367'
D367_Brus_Dis1.obs['name'] = ['D367_Brus_Dis1_' + s for s in list(D367_Brus_Dis1.obs.index)]
D367_Brus_Dis1.obs_names = D367_Brus_Dis1.obs['name']
D367_Brus_Dis1
... reading from cache file ./cache/D367_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[278]:
AnnData object with n_obs × n_vars = 2192 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [279]:
sc.pl.highest_expr_genes(D367_Brus_Dis1, n_top=20)
In [280]:
sc.pp.filter_cells(D367_Brus_Dis1, min_genes=0)
mito_genes = D367_Brus_Dis1.var_names.str.startswith('MT-')
D367_Brus_Dis1.obs['percent_mito'] = np.sum(
    D367_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D367_Brus_Dis1.X, axis=1).A1
D367_Brus_Dis1.obs['n_counts'] = D367_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D367_Brus_Dis1.to_df())
ribo_genes = D367_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D367_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D367_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D367_Brus_Dis1.X, axis=1).A1
D367_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D367_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [281]:
sc.pp.filter_cells(D367_Brus_Dis1, min_genes=500)
D367_Brus_Dis1 = D367_Brus_Dis1[D367_Brus_Dis1.obs['n_counts'] < 25000, :]
D367_Brus_Dis1 = D367_Brus_Dis1[D367_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
filtered out 63 cells that have less than 500 genes expressed
In [282]:
# scrublet
scrub = scr.Scrublet(D367_Brus_Dis1.X, expected_doublet_rate=0.018)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D367_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D367_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.20
Detected doublet rate = 0.5%
Estimated detectable doublet fraction = 34.2%
Overall doublet rate:
	Expected   = 1.8%
	Estimated  = 1.4%
Elapsed time: 1.6 seconds
Out[282]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1ea0b48e48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9e966080>],
       dtype=object))
In [283]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D367_Brus_Dis1.X).predict()
D367_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30817437171936035 seconds
Jaccard graph constructed in 0.5569641590118408 seconds
Wrote graph to binary file in 0.05292320251464844 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.915315
Louvain completed 21 runs in 1.6288034915924072 seconds
PhenoGraph complete in 2.5775041580200195 seconds
Found communities [-1, ... 22], with sizes: [210, 389, 372, 142, 138, 129, 123, 121, 106, 96, 94, 90, 88, 86, 85, 78, 78, 62, 61, 30, 24, 23, 14, 11]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30838942527770996 seconds
Jaccard graph constructed in 0.6199069023132324 seconds
Wrote graph to binary file in 0.3374314308166504 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913035
Louvain completed 21 runs in 1.648174524307251 seconds
PhenoGraph complete in 2.926996946334839 seconds
Found communities [-1, ... 23], with sizes: [213, 374, 352, 188, 139, 132, 124, 123, 122, 117, 112, 91, 85, 82, 69, 60, 54, 51, 34, 33, 27, 27, 16, 13, 12]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30765342712402344 seconds
Jaccard graph constructed in 0.6194920539855957 seconds
Wrote graph to binary file in 0.04731869697570801 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913135
After 8 runs, maximum modularity is Q = 0.914167
Louvain completed 28 runs in 2.3672173023223877 seconds
PhenoGraph complete in 3.3543691635131836 seconds
Found communities [-1, ... 22], with sizes: [246, 368, 335, 169, 162, 138, 123, 123, 121, 119, 119, 96, 80, 75, 69, 64, 46, 44, 41, 29, 29, 22, 20, 12]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30759096145629883 seconds
Jaccard graph constructed in 0.6535928249359131 seconds
Wrote graph to binary file in 0.0500178337097168 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910542
Louvain completed 21 runs in 1.6546294689178467 seconds
PhenoGraph complete in 2.683774709701538 seconds
Found communities [-1, ... 22], with sizes: [224, 438, 345, 222, 150, 135, 126, 124, 122, 94, 92, 82, 74, 61, 58, 53, 53, 50, 45, 35, 21, 18, 16, 12]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30747389793395996 seconds
Jaccard graph constructed in 0.632627010345459 seconds
Wrote graph to binary file in 0.04820871353149414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909638
After 4 runs, maximum modularity is Q = 0.910691
Louvain completed 24 runs in 2.1220600605010986 seconds
PhenoGraph complete in 3.1250510215759277 seconds
Found communities [-1, ... 22], with sizes: [206, 343, 330, 169, 154, 138, 123, 120, 119, 119, 117, 91, 84, 77, 77, 65, 64, 59, 57, 47, 29, 27, 22, 13]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3113377094268799 seconds
Jaccard graph constructed in 0.6209285259246826 seconds
Wrote graph to binary file in 0.3390324115753174 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911996
Louvain completed 21 runs in 1.6507577896118164 seconds
PhenoGraph complete in 2.938380002975464 seconds
Found communities [-1, ... 21], with sizes: [203, 407, 338, 154, 141, 133, 121, 121, 116, 106, 103, 92, 87, 86, 84, 84, 75, 58, 48, 32, 27, 23, 11]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3106231689453125 seconds
Jaccard graph constructed in 0.6192572116851807 seconds
Wrote graph to binary file in 0.0482940673828125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.914419
After 3 runs, maximum modularity is Q = 0.91564
Louvain completed 23 runs in 2.0288586616516113 seconds
PhenoGraph complete in 3.0233969688415527 seconds
Found communities [-1, ... 22], with sizes: [229, 399, 383, 145, 139, 123, 121, 118, 99, 97, 94, 90, 82, 81, 79, 77, 73, 69, 50, 32, 24, 19, 16, 11]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30785131454467773 seconds
Jaccard graph constructed in 0.6327290534973145 seconds
Wrote graph to binary file in 0.047777652740478516 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911586
Louvain completed 21 runs in 1.631681203842163 seconds
PhenoGraph complete in 2.632725954055786 seconds
Found communities [-1, ... 20], with sizes: [220, 432, 326, 140, 134, 131, 130, 128, 126, 125, 125, 111, 105, 77, 70, 68, 67, 43, 28, 27, 24, 13]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30881333351135254 seconds
Jaccard graph constructed in 0.6662120819091797 seconds
Wrote graph to binary file in 0.04795646667480469 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.916893
Louvain completed 21 runs in 1.6270017623901367 seconds
PhenoGraph complete in 2.667825937271118 seconds
Found communities [-1, ... 21], with sizes: [229, 394, 347, 217, 170, 132, 130, 120, 116, 104, 93, 85, 74, 71, 68, 66, 61, 51, 37, 36, 23, 15, 11]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30783963203430176 seconds
Jaccard graph constructed in 0.6570472717285156 seconds
Wrote graph to binary file in 0.047704219818115234 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.913946
Louvain completed 21 runs in 1.6545851230621338 seconds
PhenoGraph complete in 2.6834912300109863 seconds
Found communities [-1, ... 22], with sizes: [202, 387, 345, 146, 133, 131, 127, 119, 119, 103, 91, 90, 90, 83, 83, 81, 75, 65, 59, 31, 27, 25, 22, 16]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30813074111938477 seconds
Jaccard graph constructed in 0.6231639385223389 seconds
Wrote graph to binary file in 0.3321681022644043 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911367
After 3 runs, maximum modularity is Q = 0.912621
Louvain completed 23 runs in 2.0352542400360107 seconds
PhenoGraph complete in 3.3125600814819336 seconds
Found communities [-1, ... 20], with sizes: [218, 412, 390, 192, 136, 128, 125, 124, 120, 105, 91, 90, 85, 85, 80, 68, 63, 43, 32, 26, 22, 15]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30970287322998047 seconds
Jaccard graph constructed in 0.6314880847930908 seconds
Wrote graph to binary file in 0.04741311073303223 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909782
Louvain completed 21 runs in 1.6635627746582031 seconds
PhenoGraph complete in 2.6692705154418945 seconds
Found communities [-1, ... 21], with sizes: [197, 433, 267, 236, 138, 136, 121, 121, 121, 99, 97, 95, 94, 78, 76, 71, 70, 56, 52, 32, 27, 22, 11]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30899667739868164 seconds
Jaccard graph constructed in 0.6445717811584473 seconds
Wrote graph to binary file in 0.046227216720581055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.907196
Louvain completed 21 runs in 1.653536319732666 seconds
PhenoGraph complete in 2.6658196449279785 seconds
Found communities [-1, ... 20], with sizes: [216, 374, 358, 179, 170, 137, 132, 128, 127, 117, 115, 112, 85, 80, 68, 63, 50, 36, 35, 25, 24, 19]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2072758674621582 seconds
Jaccard graph constructed in 0.6239023208618164 seconds
Wrote graph to binary file in 0.04873156547546387 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911307
After 5 runs, maximum modularity is Q = 0.912324
Louvain completed 25 runs in 2.1757593154907227 seconds
PhenoGraph complete in 3.068892240524292 seconds
Found communities [-1, ... 22], with sizes: [248, 393, 372, 164, 139, 128, 126, 115, 114, 101, 98, 94, 77, 76, 66, 65, 61, 48, 41, 33, 30, 26, 23, 12]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3086273670196533 seconds
Jaccard graph constructed in 0.6524572372436523 seconds
Wrote graph to binary file in 0.04671001434326172 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911427
Louvain completed 21 runs in 1.7503738403320312 seconds
PhenoGraph complete in 2.7742316722869873 seconds
Found communities [-1, ... 21], with sizes: [246, 420, 396, 154, 135, 130, 126, 126, 119, 99, 95, 84, 83, 79, 69, 67, 62, 47, 30, 25, 23, 19, 16]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30905914306640625 seconds
Jaccard graph constructed in 0.6204829216003418 seconds
Wrote graph to binary file in 0.332430362701416 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909859
Louvain completed 21 runs in 1.6337625980377197 seconds
PhenoGraph complete in 2.914146900177002 seconds
Found communities [-1, ... 25], with sizes: [209, 397, 264, 227, 134, 131, 125, 124, 118, 105, 95, 88, 79, 76, 74, 72, 55, 55, 42, 34, 25, 25, 24, 20, 20, 17, 15]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3070664405822754 seconds
Jaccard graph constructed in 0.6194899082183838 seconds
Wrote graph to binary file in 0.045914411544799805 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910871
Louvain completed 21 runs in 1.6572914123535156 seconds
PhenoGraph complete in 2.646458864212036 seconds
Found communities [-1, ... 25], with sizes: [189, 390, 272, 148, 140, 118, 114, 113, 111, 110, 108, 97, 95, 91, 85, 81, 78, 59, 53, 40, 36, 27, 26, 20, 19, 17, 13]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30765438079833984 seconds
Jaccard graph constructed in 0.615267276763916 seconds
Wrote graph to binary file in 0.04660391807556152 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.908977
After 7 runs, maximum modularity is Q = 0.910094
Louvain completed 27 runs in 2.3271610736846924 seconds
PhenoGraph complete in 3.310150384902954 seconds
Found communities [-1, ... 22], with sizes: [203, 425, 358, 150, 131, 130, 120, 113, 111, 106, 94, 88, 85, 84, 79, 77, 70, 65, 39, 37, 29, 23, 18, 15]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30757808685302734 seconds
Jaccard graph constructed in 0.6173884868621826 seconds
Wrote graph to binary file in 0.04760885238647461 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912536
Louvain completed 21 runs in 1.622739315032959 seconds
PhenoGraph complete in 2.6111230850219727 seconds
Found communities [-1, ... 21], with sizes: [201, 354, 316, 145, 141, 133, 128, 126, 123, 121, 120, 120, 102, 91, 78, 78, 71, 48, 43, 35, 28, 24, 24]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30762290954589844 seconds
Jaccard graph constructed in 0.6224753856658936 seconds
Wrote graph to binary file in 0.04771590232849121 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912153
Louvain completed 21 runs in 1.6539711952209473 seconds
PhenoGraph complete in 2.6452627182006836 seconds
Found communities [-1, ... 21], with sizes: [206, 411, 370, 203, 157, 145, 123, 123, 106, 99, 93, 90, 83, 79, 73, 68, 66, 59, 23, 23, 21, 15, 14]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30814456939697266 seconds
Jaccard graph constructed in 0.6318809986114502 seconds
Wrote graph to binary file in 0.3320579528808594 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.91045
Louvain completed 21 runs in 1.6589550971984863 seconds
PhenoGraph complete in 2.944554328918457 seconds
Found communities [-1, ... 20], with sizes: [220, 415, 348, 142, 132, 130, 128, 121, 116, 113, 106, 104, 94, 88, 81, 67, 62, 54, 48, 40, 23, 18]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3077964782714844 seconds
Jaccard graph constructed in 0.6215500831604004 seconds
Wrote graph to binary file in 0.046036720275878906 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911735
After 2 runs, maximum modularity is Q = 0.912774
Louvain completed 22 runs in 1.9951071739196777 seconds
PhenoGraph complete in 2.9879136085510254 seconds
Found communities [-1, ... 22], with sizes: [223, 411, 330, 145, 140, 131, 129, 127, 124, 122, 121, 102, 101, 86, 85, 69, 45, 28, 28, 26, 25, 22, 18, 12]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30730152130126953 seconds
Jaccard graph constructed in 0.6207826137542725 seconds
Wrote graph to binary file in 0.04430031776428223 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.911311
Louvain completed 21 runs in 1.6737017631530762 seconds
PhenoGraph complete in 2.658686637878418 seconds
Found communities [-1, ... 22], with sizes: [236, 373, 298, 173, 147, 136, 136, 127, 127, 127, 103, 102, 88, 83, 73, 64, 64, 43, 38, 30, 25, 23, 18, 16]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30754995346069336 seconds
Jaccard graph constructed in 0.619056224822998 seconds
Wrote graph to binary file in 0.04679417610168457 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.912466
Louvain completed 21 runs in 1.640221357345581 seconds
PhenoGraph complete in 2.6274075508117676 seconds
Found communities [-1, ... 21], with sizes: [224, 343, 323, 141, 133, 129, 126, 122, 119, 117, 112, 104, 90, 86, 84, 82, 81, 67, 52, 43, 31, 22, 19]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3131594657897949 seconds
Jaccard graph constructed in 0.6099262237548828 seconds
Wrote graph to binary file in 0.04532980918884277 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910656
Louvain completed 21 runs in 1.6737513542175293 seconds
PhenoGraph complete in 2.654820680618286 seconds
Found communities [-1, ... 23], with sizes: [186, 371, 345, 140, 136, 128, 127, 123, 122, 118, 115, 104, 91, 91, 87, 79, 74, 49, 33, 33, 30, 21, 20, 16, 11]

In [284]:
sc.pp.normalize_per_cell(D367_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D367_Brus_Dis1) # log transform the data
D367_Brus_Dis1.raw = D367_Brus_Dis1 # freeze the object (for later use of the raw state of it)
In [285]:
D367_Brus_Dis1 = D367_Brus_Dis1[:, D367_Brus_Dis1.var['ribo_genes']]
D367_Brus_Dis1
Out[285]:
View of AnnData object with n_obs × n_vars = 2120 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

In [286]:
D372_Brus_Dis1 = sc.read_10x_mtx(
    './D372_Brus_Dis1/' + outsPath, 
    var_names='gene_symbols', 
    cache=True) 

D372_Brus_Dis1.var_names_make_unique()
D372_Brus_Dis1.obs['manip'] = 'D372_Brus_Dis1'
D372_Brus_Dis1.obs['position'] = 'Distal'
D372_Brus_Dis1.obs['method'] = 'Brushing'
D372_Brus_Dis1.obs['donor'] = 'D372'
D372_Brus_Dis1.obs['name'] = ['D372_Brus_Dis1_' + s for s in list(D372_Brus_Dis1.obs.index)]
D372_Brus_Dis1.obs_names = D372_Brus_Dis1.obs['name']
D372_Brus_Dis1
... reading from cache file ./cache/D372_Brus_Dis1-outs-filtered_gene_bc_matrices-ucagenomix-cellranger-hg19-1.3.0-matrix.h5ad
Out[286]:
AnnData object with n_obs × n_vars = 1755 × 32739 
    obs: 'manip', 'position', 'method', 'donor', 'name'
    var: 'gene_ids'
In [287]:
sc.pl.highest_expr_genes(D372_Brus_Dis1, n_top=20)
In [288]:
sc.pp.filter_cells(D372_Brus_Dis1, min_genes=0)
mito_genes = D372_Brus_Dis1.var_names.str.startswith('MT-')
D372_Brus_Dis1.obs['percent_mito'] = np.sum(
    D372_Brus_Dis1[:, mito_genes].X, axis=1).A1 / np.sum(D372_Brus_Dis1.X, axis=1).A1
D372_Brus_Dis1.obs['n_counts'] = D372_Brus_Dis1.X.sum(axis=1).A1
df_RB_depleted, counts_removed_per_cell, RB_genes_in_df = remove_RB_genes(D372_Brus_Dis1.to_df())
ribo_genes = D372_Brus_Dis1.to_df().columns.isin(RB_genes_in_df)
D372_Brus_Dis1.obs['percent_ribo'] = np.sum(
    D372_Brus_Dis1[:, ribo_genes].X, axis=1).A1 / np.sum(D372_Brus_Dis1.X, axis=1).A1
D372_Brus_Dis1.var['ribo_genes'] = [not i for i in ribo_genes]

sc.pl.violin(D372_Brus_Dis1, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'],
             jitter=0.4, multi_panel=True)
... storing 'manip' as categorical
... storing 'position' as categorical
... storing 'method' as categorical
... storing 'donor' as categorical
In [289]:
sc.pp.filter_cells(D372_Brus_Dis1, min_genes=500)
D372_Brus_Dis1 = D372_Brus_Dis1[D372_Brus_Dis1.obs['n_counts'] < 30000, :]
D372_Brus_Dis1 = D372_Brus_Dis1[D372_Brus_Dis1.obs['percent_mito'] < 0.5 , :]
filtered out 19 cells that have less than 500 genes expressed
In [290]:
# scrublet
scrub = scr.Scrublet(D372_Brus_Dis1.X, expected_doublet_rate=0.014)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
D372_Brus_Dis1.obs['doublet_scores'] = doublet_scores
D372_Brus_Dis1.obs['predicted_doublets'] = predicted_doublets
scrub.plot_histogram()
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.15
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 32.5%
Overall doublet rate:
	Expected   = 1.4%
	Estimated  = 0.9%
Elapsed time: 1.3 seconds
Out[290]:
(<Figure size 640x240 with 2 Axes>,
 array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9e45b390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e9e296240>],
       dtype=object))
In [291]:
# doubletDetection
clf = doubletdetection.BoostClassifier()
labels = clf.fit(D372_Brus_Dis1.X).predict()
D372_Brus_Dis1.obs['doubletDetection'] = labels
/home/deprez/environments/env_HCA/lib/python3.7/site-packages/doubletdetection/doubletdetection.py:178: UserWarning: Sparse raw_counts is automatically densified.
  warnings.warn("Sparse raw_counts is automatically densified.")
Iteration   1/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20571494102478027 seconds
Jaccard graph constructed in 0.5139338970184326 seconds
Wrote graph to binary file in 0.04161214828491211 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.902631
Louvain completed 21 runs in 1.607027530670166 seconds
PhenoGraph complete in 2.402970790863037 seconds
Found communities [-1, ... 18], with sizes: [151, 517, 267, 180, 134, 120, 113, 95, 90, 79, 62, 57, 55, 47, 46, 44, 32, 21, 19, 14]

Iteration   2/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.3072233200073242 seconds
Jaccard graph constructed in 0.5509219169616699 seconds
Wrote graph to binary file in 0.03649640083312988 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906597
Louvain completed 21 runs in 1.6010956764221191 seconds
PhenoGraph complete in 2.507722854614258 seconds
Found communities [-1, ... 18], with sizes: [174, 538, 248, 180, 132, 130, 124, 89, 83, 73, 71, 52, 46, 45, 40, 36, 29, 21, 18, 14]

Iteration   3/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20583438873291016 seconds
Jaccard graph constructed in 0.524289608001709 seconds
Wrote graph to binary file in 0.07836651802062988 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.899191
After 3 runs, maximum modularity is Q = 0.900269
Louvain completed 23 runs in 1.9868910312652588 seconds
PhenoGraph complete in 2.8087825775146484 seconds
Found communities [-1, ... 19], with sizes: [189, 527, 210, 178, 177, 155, 118, 91, 85, 56, 54, 53, 46, 41, 40, 29, 25, 22, 20, 14, 13]

Iteration   4/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.30797839164733887 seconds
Jaccard graph constructed in 0.5188636779785156 seconds
Wrote graph to binary file in 0.042851924896240234 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905209
Louvain completed 21 runs in 1.6132774353027344 seconds
PhenoGraph complete in 2.5005009174346924 seconds
Found communities [-1, ... 18], with sizes: [126, 468, 313, 176, 158, 134, 123, 117, 111, 83, 56, 49, 48, 43, 42, 38, 19, 16, 12, 11]

Iteration   5/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20615077018737793 seconds
Jaccard graph constructed in 0.5189366340637207 seconds
Wrote graph to binary file in 0.38534021377563477 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905226
Louvain completed 21 runs in 1.6292517185211182 seconds
PhenoGraph complete in 2.751446008682251 seconds
Found communities [-1, ... 18], with sizes: [154, 466, 341, 180, 128, 127, 120, 93, 93, 82, 63, 56, 49, 41, 41, 28, 26, 22, 19, 14]

Iteration   6/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20828032493591309 seconds
Jaccard graph constructed in 0.619469165802002 seconds
Wrote graph to binary file in 0.041007041931152344 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904828
After 3 runs, maximum modularity is Q = 0.906637
Louvain completed 23 runs in 1.9833974838256836 seconds
PhenoGraph complete in 2.8635506629943848 seconds
Found communities [-1, ... 19], with sizes: [131, 430, 266, 183, 139, 117, 114, 111, 101, 86, 85, 59, 53, 46, 44, 44, 43, 38, 22, 20, 11]

Iteration   7/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2070465087890625 seconds
Jaccard graph constructed in 0.6272745132446289 seconds
Wrote graph to binary file in 0.03924131393432617 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904488
Louvain completed 21 runs in 1.5961191654205322 seconds
PhenoGraph complete in 2.481203556060791 seconds
Found communities [-1, ... 20], with sizes: [168, 374, 256, 192, 184, 139, 138, 110, 83, 77, 76, 57, 55, 43, 42, 36, 31, 21, 19, 15, 14, 13]

Iteration   8/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20821666717529297 seconds
Jaccard graph constructed in 0.6178300380706787 seconds
Wrote graph to binary file in 0.0373225212097168 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90347
Louvain completed 21 runs in 1.6073861122131348 seconds
PhenoGraph complete in 2.482001543045044 seconds
Found communities [-1, ... 17], with sizes: [184, 470, 316, 174, 155, 136, 126, 112, 72, 62, 57, 46, 45, 42, 42, 35, 34, 22, 13]

Iteration   9/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.2064056396484375 seconds
Jaccard graph constructed in 0.5516624450683594 seconds
Wrote graph to binary file in 0.06741881370544434 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.910551
Louvain completed 21 runs in 1.5890874862670898 seconds
PhenoGraph complete in 2.4282758235931396 seconds
Found communities [-1, ... 17], with sizes: [179, 466, 301, 180, 139, 115, 109, 109, 103, 96, 55, 50, 49, 45, 43, 42, 31, 19, 12]

Iteration  10/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.21695446968078613 seconds
Jaccard graph constructed in 0.5578131675720215 seconds
Wrote graph to binary file in 0.07282543182373047 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904749
Louvain completed 21 runs in 1.602863073348999 seconds
PhenoGraph complete in 2.4634809494018555 seconds
Found communities [-1, ... 19], with sizes: [175, 504, 299, 179, 136, 126, 111, 106, 105, 70, 55, 54, 40, 35, 34, 33, 28, 17, 13, 12, 11]

Iteration  11/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20670032501220703 seconds
Jaccard graph constructed in 0.5516238212585449 seconds
Wrote graph to binary file in 0.051191091537475586 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904208
Louvain completed 21 runs in 1.6287384033203125 seconds
PhenoGraph complete in 2.4512691497802734 seconds
Found communities [-1, ... 19], with sizes: [178, 367, 283, 181, 145, 140, 136, 122, 113, 108, 72, 55, 45, 41, 40, 29, 28, 19, 15, 14, 12]

Iteration  12/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20838570594787598 seconds
Jaccard graph constructed in 0.6194431781768799 seconds
Wrote graph to binary file in 0.32436418533325195 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.901203
After 5 runs, maximum modularity is Q = 0.903511
Louvain completed 25 runs in 2.1236932277679443 seconds
PhenoGraph complete in 3.292933225631714 seconds
Found communities [-1, ... 18], with sizes: [176, 456, 323, 177, 171, 122, 112, 107, 98, 85, 55, 43, 42, 42, 37, 30, 19, 17, 17, 14]

Iteration  13/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20653152465820312 seconds
Jaccard graph constructed in 0.5965385437011719 seconds
Wrote graph to binary file in 0.04018712043762207 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904997
Louvain completed 21 runs in 1.5950124263763428 seconds
PhenoGraph complete in 2.456677198410034 seconds
Found communities [-1, ... 19], with sizes: [210, 506, 304, 176, 116, 109, 104, 97, 95, 86, 63, 55, 41, 38, 37, 31, 20, 19, 13, 12, 11]

Iteration  14/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20726919174194336 seconds
Jaccard graph constructed in 0.539621114730835 seconds
Wrote graph to binary file in 0.050312042236328125 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904782
After 5 runs, maximum modularity is Q = 0.905838
Louvain completed 25 runs in 2.1546876430511475 seconds
PhenoGraph complete in 2.965471029281616 seconds
Found communities [-1, ... 20], with sizes: [157, 496, 363, 177, 114, 110, 110, 97, 70, 67, 51, 43, 43, 42, 37, 33, 33, 32, 20, 18, 16, 14]

Iteration  15/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20701122283935547 seconds
Jaccard graph constructed in 0.5227954387664795 seconds
Wrote graph to binary file in 0.05220174789428711 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.9051
After 2 runs, maximum modularity is Q = 0.90653
Louvain completed 22 runs in 1.9522175788879395 seconds
PhenoGraph complete in 2.766913890838623 seconds
Found communities [-1, ... 17], with sizes: [185, 465, 443, 184, 114, 99, 94, 83, 81, 67, 63, 54, 51, 40, 35, 28, 23, 22, 12]

Iteration  16/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20703864097595215 seconds
Jaccard graph constructed in 0.5422213077545166 seconds
Wrote graph to binary file in 0.05976581573486328 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.903313
Louvain completed 21 runs in 1.6176977157592773 seconds
PhenoGraph complete in 2.439768075942993 seconds
Found communities [-1, ... 18], with sizes: [236, 430, 325, 182, 118, 118, 111, 109, 100, 85, 54, 53, 43, 39, 31, 30, 24, 23, 18, 14]

Iteration  17/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20911455154418945 seconds
Jaccard graph constructed in 0.5380370616912842 seconds
Wrote graph to binary file in 0.0778660774230957 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904103
After 3 runs, maximum modularity is Q = 0.906238
Louvain completed 23 runs in 2.006376266479492 seconds
PhenoGraph complete in 2.848581314086914 seconds
Found communities [-1, ... 17], with sizes: [195, 419, 322, 176, 132, 118, 108, 107, 107, 101, 59, 57, 55, 49, 42, 35, 30, 20, 11]

Iteration  18/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20604419708251953 seconds
Jaccard graph constructed in 0.5328047275543213 seconds
Wrote graph to binary file in 0.36903810501098633 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906027
After 2 runs, maximum modularity is Q = 0.908249
Louvain completed 22 runs in 1.9443886280059814 seconds
PhenoGraph complete in 3.064648151397705 seconds
Found communities [-1, ... 19], with sizes: [181, 422, 339, 181, 127, 108, 100, 99, 87, 87, 83, 64, 59, 47, 42, 29, 29, 19, 15, 14, 11]

Iteration  19/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20657038688659668 seconds
Jaccard graph constructed in 0.608447790145874 seconds
Wrote graph to binary file in 0.037868499755859375 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.909134
Louvain completed 21 runs in 1.6105339527130127 seconds
PhenoGraph complete in 2.4751977920532227 seconds
Found communities [-1, ... 18], with sizes: [168, 495, 303, 181, 134, 114, 109, 100, 75, 72, 55, 53, 53, 48, 47, 42, 37, 25, 18, 14]

Iteration  20/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20666742324829102 seconds
Jaccard graph constructed in 0.5423116683959961 seconds
Wrote graph to binary file in 0.05187082290649414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.900571
After 8 runs, maximum modularity is Q = 0.901774
Louvain completed 28 runs in 2.328256130218506 seconds
PhenoGraph complete in 3.1406610012054443 seconds
Found communities [-1, ... 16], with sizes: [165, 467, 333, 177, 138, 127, 117, 108, 107, 99, 60, 56, 50, 38, 36, 31, 21, 13]

Iteration  21/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20690321922302246 seconds
Jaccard graph constructed in 0.5393500328063965 seconds
Wrote graph to binary file in 0.05608105659484863 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.904099
After 2 runs, maximum modularity is Q = 0.905887
After 9 runs, maximum modularity is Q = 0.906971
Louvain completed 29 runs in 2.634321689605713 seconds
PhenoGraph complete in 3.4601807594299316 seconds
Found communities [-1, ... 18], with sizes: [174, 484, 243, 176, 156, 148, 119, 88, 87, 72, 65, 59, 53, 51, 43, 39, 28, 23, 22, 13]

Iteration  22/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20659542083740234 seconds
Jaccard graph constructed in 0.5471713542938232 seconds
Wrote graph to binary file in 0.04985666275024414 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906413
Louvain completed 21 runs in 1.6360414028167725 seconds
PhenoGraph complete in 2.4550068378448486 seconds
Found communities [-1, ... 18], with sizes: [160, 471, 235, 231, 179, 114, 106, 95, 83, 80, 67, 57, 54, 43, 42, 37, 32, 27, 18, 12]

Iteration  23/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20646929740905762 seconds
Jaccard graph constructed in 0.5364046096801758 seconds
Wrote graph to binary file in 0.05254793167114258 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.90219
After 2 runs, maximum modularity is Q = 0.903544
Louvain completed 22 runs in 1.9394330978393555 seconds
PhenoGraph complete in 2.764660120010376 seconds
Found communities [-1, ... 19], with sizes: [159, 354, 326, 177, 138, 133, 122, 115, 109, 83, 81, 55, 54, 42, 35, 32, 32, 31, 25, 20, 20]

Iteration  24/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20595860481262207 seconds
Jaccard graph constructed in 0.5301704406738281 seconds
Wrote graph to binary file in 0.3261222839355469 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.905169
Louvain completed 21 runs in 1.58907151222229 seconds
PhenoGraph complete in 2.6655876636505127 seconds
Found communities [-1, ... 17], with sizes: [173, 519, 286, 178, 131, 122, 121, 111, 96, 84, 55, 55, 48, 45, 31, 30, 28, 17, 13]

Iteration  25/25

Creating synthetic doublets...
Normalizing...
Running PCA...
Clustering augmented data set with Phenograph...

Setting directed=False because prune=True
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 0.20603418350219727 seconds
Jaccard graph constructed in 0.5456793308258057 seconds
Wrote graph to binary file in 0.05715012550354004 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.906646
Louvain completed 21 runs in 1.6018450260162354 seconds
PhenoGraph complete in 2.42425274848938 seconds
Found communities [-1, ... 17], with sizes: [155, 531, 272, 179, 130, 119, 118, 105, 92, 81, 70, 52, 52, 44, 41, 40, 25, 21, 16]

In [292]:
sc.pp.normalize_per_cell(D372_Brus_Dis1, counts_per_cell_after=1e4) # Normalize all data
sc.pp.log1p(D372_Brus_Dis1) # log transform the data
D372_Brus_Dis1.raw = D372_Brus_Dis1 # freeze the object (for later use of the raw state of it)
In [293]:
D372_Brus_Dis1 = D372_Brus_Dis1[:, D372_Brus_Dis1.var['ribo_genes']]
D372_Brus_Dis1
Out[293]:
View of AnnData object with n_obs × n_vars = 1715 × 32568 
    obs: 'manip', 'position', 'method', 'donor', 'name', 'n_genes', 'percent_mito', 'n_counts', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'doubletDetection'
    var: 'gene_ids', 'ribo_genes'

Concatenate full dataset

Back to top

In [294]:
adata = D322_Biop_Nas1.concatenate(D322_Biop_Pro1, D322_Biop_Int1, 
                                   D326_Biop_Pro1, D326_Biop_Int1, D326_Brus_Dis1,
                                   D337_Brus_Dis1,
                                   D339_Biop_Nas1, D339_Biop_Pro1, D339_Biop_Int1, D339_Brus_Dis1,
                                   D344_Biop_Nas1, D344_Biop_Pro1, D344_Biop_Int1, D344_Brus_Dis1,
                                   D345_Biop_Nas1,
                                   D353_Brus_Nas1, D353_Biop_Pro1, D353_Biop_Int2, D353_Brus_Dis1,
                                   D354_Biop_Pro1, D354_Biop_Int2, D354_Brus_Dis1,
                                   D363_Brus_Nas1, D363_Biop_Pro1, D363_Biop_Int2, D363_Brus_Dis1,
                                   D367_Brus_Nas1, D367_Biop_Pro1, D367_Biop_Int1, D367_Brus_Dis1,
                                   D372_Brus_Nas1, D372_Biop_Pro1, D372_Biop_Int1, D372_Biop_Int2, D372_Brus_Dis1,
                                   join='inner')
In [295]:
adata.write('/Data/Preprocessed_doublet_dataset.h5ad')
... storing 'donor' as categorical
... storing 'manip' as categorical
... storing 'method' as categorical
... storing 'position' as categorical
In [296]:
adata.obs.to_csv(path_or_buf = '/Data/metadata_doublet.tsv', 
               sep = '\t')
In [ ]: